From dc1771f718548f7d4b93991b174c6e7b5e1ba410 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:14 -0700 Subject: Revert "drivers: core: synchronize really_probe() and dev_uevent()" This reverts commit c0a40097f0bc81deafc15f9195d1fb54595cd6d0. Probing a device can take arbitrary long time. In the field we observed that, for example, probing a bad micro-SD cards in an external USB card reader (or maybe cards were good but cables were flaky) sometimes takes longer than 2 minutes due to multiple retries at various levels of the stack. We can not block uevent_show() method for that long because udev is reading that attribute very often and that blocks udev and interferes with booting of the system. The change that introduced locking was concerned with dev_uevent() racing with unbinding the driver. However we can handle it without locking (which will be done in subsequent patch). There was also claim that synchronization with probe() is needed to properly load USB drivers, however this is a red herring: the change adding the lock was introduced in May of last year and USB loading and probing worked properly for many years before that. Revert the harmful locking. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-1-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index d2f9d3a59d6b..f9c1c623bca5 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2726,11 +2726,8 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, if (!env) return -ENOMEM; - /* Synchronize with really_probe() */ - device_lock(dev); /* let the kset specific function add its keys */ retval = kset->uevent_ops->uevent(&dev->kobj, env); - device_unlock(dev); if (retval) goto out; -- cgit v1.2.3 From 04d3e5461c1f5cf8eec964ab64948ebed826e95e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:15 -0700 Subject: driver core: introduce device_set_driver() helper In preparation to closing a race when reading driver pointer in dev_uevent() code, instead of setting device->driver pointer directly introduce device_set_driver() helper. Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-2-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 6 ++++++ drivers/base/core.c | 2 +- drivers/base/dd.c | 7 +++---- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/base.h b/drivers/base/base.h index 0042e4774b0c..eb203cf8370b 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -180,6 +180,12 @@ int driver_add_groups(const struct device_driver *drv, const struct attribute_gr void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); +static inline void device_set_driver(struct device *dev, const struct device_driver *drv) +{ + // FIXME - this cast should not be needed "soon" + dev->driver = (struct device_driver *)drv; +} + int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index f9c1c623bca5..b000ee61c149 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3697,7 +3697,7 @@ done: device_pm_remove(dev); dpm_sysfs_remove(dev); DPMError: - dev->driver = NULL; + device_set_driver(dev, NULL); bus_remove_device(dev); BusError: device_remove_attrs(dev); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f0e4b4aba885..b526e0e0f52d 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -550,7 +550,7 @@ static void device_unbind_cleanup(struct device *dev) arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; - dev->driver = NULL; + device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); @@ -629,8 +629,7 @@ static int really_probe(struct device *dev, const struct device_driver *drv) } re_probe: - // FIXME - this cast should not be needed "soon" - dev->driver = (struct device_driver *)drv; + device_set_driver(dev, drv); /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); @@ -1014,7 +1013,7 @@ static int __device_attach(struct device *dev, bool allow_async) if (ret == 0) ret = 1; else { - dev->driver = NULL; + device_set_driver(dev, NULL); ret = 0; } } else { -- cgit v1.2.3 From 18daa52418e7e4629ed1703b64777294209d2622 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:16 -0700 Subject: driver core: fix potential NULL pointer dereference in dev_uevent() If userspace reads "uevent" device attribute at the same time as another threads unbinds the device from its driver, change to dev->driver from a valid pointer to NULL may result in crash. Fix this by using READ_ONCE() when fetching the pointer, and take bus' drivers klist lock to make sure driver instance will not disappear while we access it. Use WRITE_ONCE() when setting the driver pointer to ensure there is no tearing. Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-3-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 13 ++++++++++++- drivers/base/bus.c | 2 +- drivers/base/core.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/base.h b/drivers/base/base.h index eb203cf8370b..123031a757d9 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -73,6 +73,7 @@ static inline void subsys_put(struct subsys_private *sp) kset_put(&sp->subsys); } +struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { @@ -182,8 +183,18 @@ void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { + /* + * Majority (all?) read accesses to dev->driver happens either + * while holding device lock or in bus/driver code that is only + * invoked when the device is bound to a driver and there is no + * concern of the pointer being changed while it is being read. + * However when reading device's uevent file we read driver pointer + * without taking device lock (so we do not block there for + * arbitrary amount of time). We use WRITE_ONCE() here to prevent + * tearing so that READ_ONCE() can safely be used in uevent code. + */ // FIXME - this cast should not be needed "soon" - dev->driver = (struct device_driver *)drv; + WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 5ea3b03af9ba..5e75e1bce551 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -57,7 +57,7 @@ static int __must_check bus_rescan_devices_helper(struct device *dev, * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ -static struct subsys_private *bus_to_subsys(const struct bus_type *bus) +struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; diff --git a/drivers/base/core.c b/drivers/base/core.c index b000ee61c149..cbc0099d8ef2 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2624,6 +2624,35 @@ static const char *dev_uevent_name(const struct kobject *kobj) return NULL; } +/* + * Try filling "DRIVER=" uevent variable for a device. Because this + * function may race with binding and unbinding the device from a driver, + * we need to be careful. Binding is generally safe, at worst we miss the + * fact that the device is already bound to a driver (but the driver + * information that is delivered through uevents is best-effort, it may + * become obsolete as soon as it is generated anyways). Unbinding is more + * risky as driver pointer is transitioning to NULL, so READ_ONCE() should + * be used to make sure we are dealing with the same pointer, and to + * ensure that driver structure is not going to disappear from under us + * we take bus' drivers klist lock. The assumption that only registered + * driver can be bound to a device, and to unregister a driver bus code + * will take the same lock. + */ +static void dev_driver_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + struct subsys_private *sp = bus_to_subsys(dev->bus); + + if (sp) { + scoped_guard(spinlock, &sp->klist_drivers.k_lock) { + struct device_driver *drv = READ_ONCE(dev->driver); + if (drv) + add_uevent_var(env, "DRIVER=%s", drv->name); + } + + subsys_put(sp); + } +} + static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { const struct device *dev = kobj_to_dev(kobj); @@ -2655,8 +2684,8 @@ static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) if (dev->type && dev->type->name) add_uevent_var(env, "DEVTYPE=%s", dev->type->name); - if (dev->driver) - add_uevent_var(env, "DRIVER=%s", dev->driver->name); + /* Add "DRIVER=%s" variable if the device is bound to a driver */ + dev_driver_uevent(dev, env); /* Add common DT information about the device */ of_device_uevent(dev, env); -- cgit v1.2.3 From 10076ae01388caffec3b90abcce05f24a9e4b15e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Apr 2025 15:28:46 +0300 Subject: drivers/base: Extend documentation with preferred way to use auxbus Document the preferred way to use auxiliary bus. Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/206e8c249f630abd3661deb36b84b26282241040.1743510317.git.leon@kernel.org [ reworded the text a bit - gregkh ] Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index afa4df4c5a3f..95717d509ca9 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -156,6 +156,16 @@ * }, * .ops = my_custom_ops, * }; + * + * Please note that such custom ops approach is valid, but it is hard to implement + * it right without global locks per-device to protect from auxiliary_drv removal + * during call to that ops. In addition, this implementation lacks proper module + * dependency, which causes to load/unload races between auxiliary parent and devices + * modules. + * + * The most easiest way to provide these ops reliably without needing to + * have a lock is to EXPORT_SYMBOL*() them and rely on already existing + * modules infrastructure for validity and correct dependencies chains. */ static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, -- cgit v1.2.3 From bc2c46426f2d95e58c82f394531afdd034c8706c Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 14 Apr 2025 15:11:23 +0800 Subject: software node: Prevent link creation failure from causing kobj reference count imbalance syzbot reported a uaf in software_node_notify_remove. [1] When any of the two sysfs_create_link() in software_node_notify() fails, the swnode->kobj reference count will not increase normally, which will cause swnode to be released incorrectly due to the imbalance of kobj reference count when executing software_node_notify_remove(). Increase the reference count of kobj before creating the link to avoid uaf. [1] BUG: KASAN: slab-use-after-free in software_node_notify_remove+0x1bc/0x1c0 drivers/base/swnode.c:1108 Read of size 1 at addr ffff888033c08908 by task syz-executor105/5844 Freed by task 5844: software_node_notify_remove+0x159/0x1c0 drivers/base/swnode.c:1106 device_platform_notify_remove drivers/base/core.c:2387 [inline] Fixes: 9eb59204d519 ("iommufd/selftest: Add set_dev_pasid in mock iommu") Reported-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2ff22910687ee0dfd48e Tested-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250414071123.1228331-1-lizhi.xu@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index b1726a3515f6..5c78fa6ae772 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -1080,6 +1080,7 @@ void software_node_notify(struct device *dev) if (!swnode) return; + kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; @@ -1089,8 +1090,6 @@ void software_node_notify(struct device *dev) sysfs_remove_link(&dev->kobj, "software_node"); return; } - - kobject_get(&swnode->kobj); } void software_node_notify_remove(struct device *dev) -- cgit v1.2.3 From b9792abb76ae1649080b8d48092c52e24c7bbdc2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Apr 2025 22:51:10 +1000 Subject: drivers/base/memory: Avoid overhead from for_each_present_section_nr() for_each_present_section_nr() was introduced to add_boot_memory_block() by commit 61659efdb35c ("drivers/base/memory: improve add_boot_memory_block()"). It causes unnecessary overhead when the present sections are really sparse. next_present_section_nr() called by the macro to find the next present section, which is far away from the spanning sections in the specified block. Too much time consumed by next_present_section_nr() in this case, which can lead to softlockup as observed by Aditya Gupta on IBM Power10 machine. watchdog: BUG: soft lockup - CPU#248 stuck for 22s! [swapper/248:1] Modules linked in: CPU: 248 UID: 0 PID: 1 Comm: swapper/248 Not tainted 6.15.0-rc1-next-20250408 #1 VOLUNTARY Hardware name: 9105-22A POWER10 (raw) 0x800200 opal:v7.1-107-gfda75d121942 PowerNV NIP: c00000000209218c LR: c000000002092204 CTR: 0000000000000000 REGS: c00040000418fa30 TRAP: 0900 Not tainted (6.15.0-rc1-next-20250408) MSR: 9000000002009033 CR: 28000428 XER: 00000000 CFAR: 0000000000000000 IRQMASK: 0 GPR00: c000000002092204 c00040000418fcd0 c000000001b08100 0000000000000040 GPR04: 0000000000013e00 c000c03ffebabb00 0000000000c03fff c000400fff587f80 GPR08: 0000000000000000 00000000001196f7 0000000000000000 0000000028000428 GPR12: 0000000000000000 c000000002e80000 c00000000001007c 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR28: c000000002df7f70 0000000000013dc0 c0000000011dd898 0000000008000000 NIP [c00000000209218c] memory_dev_init+0x114/0x1e0 LR [c000000002092204] memory_dev_init+0x18c/0x1e0 Call Trace: [c00040000418fcd0] [c000000002092204] memory_dev_init+0x18c/0x1e0 (unreliable) [c00040000418fd50] [c000000002091348] driver_init+0x78/0xa4 [c00040000418fd70] [c0000000020063ac] kernel_init_freeable+0x22c/0x370 [c00040000418fde0] [c0000000000100a8] kernel_init+0x34/0x25c [c00040000418fe50] [c00000000000cd94] ret_from_kernel_user_thread+0x14/0x1c Avoid the overhead by folding for_each_present_section_nr() to the outer loop. add_boot_memory_block() is dropped after that. Fixes: 61659efdb35c ("drivers/base/memory: improve add_boot_memory_block()") Closes: https://lore.kernel.org/linux-mm/20250409180344.477916-1-adityag@linux.ibm.com Reported-by: Aditya Gupta Signed-off-by: Gavin Shan Acked-by: Oscar Salvador Tested-by: Aditya Gupta Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250410125110.1232329-1-gshan@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8f3a41d9bfaa..19469e7f88c2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -816,21 +816,6 @@ static int add_memory_block(unsigned long block_id, unsigned long state, return 0; } -static int __init add_boot_memory_block(unsigned long base_section_nr) -{ - unsigned long nr; - - for_each_present_section_nr(base_section_nr, nr) { - if (nr >= (base_section_nr + sections_per_block)) - break; - - return add_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, NULL, NULL); - } - - return 0; -} - static int add_hotplug_memory_block(unsigned long block_id, struct vmem_altmap *altmap, struct memory_group *group) @@ -957,7 +942,7 @@ static const struct attribute_group *memory_root_attr_groups[] = { void __init memory_dev_init(void) { int ret; - unsigned long block_sz, nr; + unsigned long block_sz, block_id, nr; /* Validate the configured memory block size */ block_sz = memory_block_size_bytes(); @@ -970,15 +955,23 @@ void __init memory_dev_init(void) panic("%s() failed to register subsystem: %d\n", __func__, ret); /* - * Create entries for memory sections that were found - * during boot and have been initialized + * Create entries for memory sections that were found during boot + * and have been initialized. Use @block_id to track the last + * handled block and initialize it to an invalid value (ULONG_MAX) + * to bypass the block ID matching check for the first present + * block so that it can be covered. */ - for (nr = 0; nr <= __highest_present_section_nr; - nr += sections_per_block) { - ret = add_boot_memory_block(nr); - if (ret) - panic("%s() failed to add memory block: %d\n", __func__, - ret); + block_id = ULONG_MAX; + for_each_present_section_nr(0, nr) { + if (block_id != ULONG_MAX && memory_block_id(nr) == block_id) + continue; + + block_id = memory_block_id(nr); + ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL); + if (ret) { + panic("%s() failed to add memory block: %d\n", + __func__, ret); + } } } -- cgit v1.2.3 From f95bbfe18512c5c018720468959edac056a17196 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Thu, 27 Feb 2025 10:49:30 -0800 Subject: drivers: base: handle module_kobject creation module_add_driver() relies on module_kset list for /sys/module//drivers directory creation. Since, commit 96a1a2412acba ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time") drivers which are initialized from subsys_initcall() or any other higher precedence initcall couldn't find the related kobject entry in the module_kset list because module_kset is not fully populated by the time module_add_driver() refers it. As a consequence, module_add_driver() returns early without calling make_driver_name(). Therefore, /sys/module//drivers is never created. Fix this issue by letting module_add_driver() handle module_kobject creation itself. Fixes: 96a1a2412acb ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time") Cc: stable@vger.kernel.org # requires all other patches from the series Suggested-by: Rasmus Villemoes Signed-off-by: Shyam Saini Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250227184930.34163-5-shyamsaini@linux.microsoft.com Signed-off-by: Petr Pavlu --- drivers/base/module.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/module.c b/drivers/base/module.c index 5bc71bea883a..218aaa096455 100644 --- a/drivers/base/module.c +++ b/drivers/base/module.c @@ -42,16 +42,13 @@ int module_add_driver(struct module *mod, const struct device_driver *drv) if (mod) mk = &mod->mkobj; else if (drv->mod_name) { - struct kobject *mkobj; - - /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(module_kset, drv->mod_name); - if (mkobj) { - mk = container_of(mkobj, struct module_kobject, kobj); + /* Lookup or create built-in module entry in /sys/modules */ + mk = lookup_or_create_module_kobject(drv->mod_name); + if (mk) { /* remember our module structure */ drv->p->mkobj = mk; - /* kset_find_obj took a reference */ - kobject_put(mkobj); + /* lookup_or_create_module_kobject took a reference */ + kobject_put(&mk->kobj); } } -- cgit v1.2.3 From e079d7c4db5cba1e8a315dc93030dfb6c7b49459 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 06:59:41 +0200 Subject: devtmpfs: don't use vfs_getattr_nosec to query i_mode The recent move of the bdev_statx call to the low-level vfs_getattr_nosec helper caused it being used by devtmpfs, which leads to deadlocks in md teardown due to the block device lookup and put interfering with the unusual lifetime rules in md. But as handle_remove only works on inodes created and owned by devtmpfs itself there is no need to use vfs_getattr_nosec vs simply reading the mode from the inode directly. Switch to that to avoid the bdev lookup or any other unintentional side effect. Reported-by: Shin'ichiro Kawasaki Reported-by: Xiao Ni Fixes: 777d0961ff95 ("fs: move the bdex_statx call to vfs_getattr_nosec") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250423045941.1667425-1-hch@lst.de Tested-by: Shin'ichiro Kawasaki Tested-by: Xiao Ni Tested-by: Ayush Jain Tested-by: Heiko Carstens Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 6dd1a8860f1c..31bfb3194b4c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -296,7 +296,7 @@ static int delete_path(const char *nodepath) return err; } -static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) +static int dev_mynode(struct device *dev, struct inode *inode) { /* did we create it */ if (inode->i_private != &thread) @@ -304,13 +304,13 @@ static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *sta /* does the dev_t match */ if (is_blockdev(dev)) { - if (!S_ISBLK(stat->mode)) + if (!S_ISBLK(inode->i_mode)) return 0; } else { - if (!S_ISCHR(stat->mode)) + if (!S_ISCHR(inode->i_mode)) return 0; } - if (stat->rdev != dev->devt) + if (inode->i_rdev != dev->devt) return 0; /* ours */ @@ -321,20 +321,16 @@ static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; - struct kstat stat; - struct path p; + struct inode *inode; int deleted = 0; - int err; + int err = 0; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); - p.mnt = parent.mnt; - p.dentry = dentry; - err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, - AT_STATX_SYNC_AS_STAT); - if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { + inode = d_inode(dentry); + if (dev_mynode(dev, inode)) { struct iattr newattrs; /* * before unlinking this node, reset permissions @@ -342,7 +338,7 @@ static int handle_remove(const char *nodename, struct device *dev) */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; - newattrs.ia_mode = stat.mode & ~0777; + newattrs.ia_mode = inode->i_mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); -- cgit v1.2.3 From 95deee37a12364f410d22c6a8383f59738a2fef3 Mon Sep 17 00:00:00 2001 From: Will McVicker Date: Thu, 24 Apr 2025 11:04:19 -0700 Subject: platform: Fix race condition during DMA configure at IOMMU probe time To avoid a race between the IOMMU probing thread and the device driver async probing thread during configuration of the platform DMA, update `platform_dma_configure()` to read `dev->driver` once and test if it's NULL before using it. This ensures that we don't de-reference an invalid platform driver pointer if the device driver is asynchronously bound while configuring the DMA. Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Signed-off-by: Will McVicker Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20250424180420.3928523-1-willmcvicker@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 1813cfd0c4bd..cfccf3ff36e7 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1440,7 +1440,7 @@ static void platform_shutdown(struct device *_dev) static int platform_dma_configure(struct device *dev) { - struct platform_driver *drv = to_platform_driver(dev->driver); + struct device_driver *drv = READ_ONCE(dev->driver); struct fwnode_handle *fwnode = dev_fwnode(dev); enum dev_dma_attr attr; int ret = 0; @@ -1451,8 +1451,8 @@ static int platform_dma_configure(struct device *dev) attr = acpi_get_dma_attr(to_acpi_device_node(fwnode)); ret = acpi_dma_configure(dev, attr); } - /* @drv may not be valid when we're called from the IOMMU layer */ - if (ret || !dev->driver || drv->driver_managed_dma) + /* @dev->driver may not be valid when we're called from the IOMMU layer */ + if (ret || !drv || to_platform_driver(drv)->driver_managed_dma) return ret; ret = iommu_device_use_default_domain(dev); -- cgit v1.2.3 From f4818881c47fd91fcb6d62373c57c7844e3de1c0 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 20:23:23 -0700 Subject: x86/its: Enable Indirect Target Selection mitigation Indirect Target Selection (ITS) is a bug in some pre-ADL Intel CPUs with eIBRS. It affects prediction of indirect branch and RETs in the lower half of cacheline. Due to ITS such branches may get wrongly predicted to a target of (direct or indirect) branch that is located in the upper half of the cacheline. Scope of impact =============== Guest/host isolation -------------------- When eIBRS is used for guest/host isolation, the indirect branches in the VMM may still be predicted with targets corresponding to branches in the guest. Intra-mode ---------- cBPF or other native gadgets can be used for intra-mode training and disclosure using ITS. User/kernel isolation --------------------- When eIBRS is enabled user/kernel isolation is not impacted. Indirect Branch Prediction Barrier (IBPB) ----------------------------------------- After an IBPB, indirect branches may be predicted with targets corresponding to direct branches which were executed prior to IBPB. This is mitigated by a microcode update. Add cmdline parameter indirect_target_selection=off|on|force to control the mitigation to relocate the affected branches to an ITS-safe thunk i.e. located in the upper half of cacheline. Also add the sysfs reporting. When retpoline mitigation is deployed, ITS safe-thunks are not needed, because retpoline sequence is already ITS-safe. Similarly, when call depth tracking (CDT) mitigation is deployed (retbleed=stuff), ITS safe return thunk is not used, as CDT prevents RSB-underflow. To not overcomplicate things, ITS mitigation is not supported with spectre-v2 lfence;jmp mitigation. Moreover, it is less practical to deploy lfence;jmp mitigation on ITS affected parts anyways. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + Documentation/admin-guide/kernel-parameters.txt | 13 ++ arch/x86/kernel/cpu/bugs.c | 140 ++++++++++++++++++++- drivers/base/cpu.c | 3 + include/linux/cpu.h | 2 + 5 files changed, 155 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 206079d3bd5b..6a1acabb29d8 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -511,6 +511,7 @@ Description: information about CPUs heterogeneity. What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/gather_data_sampling + /sys/devices/system/cpu/vulnerabilities/indirect_target_selection /sys/devices/system/cpu/vulnerabilities/itlb_multihit /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9fd26b95b34..d3e78e795e0d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2202,6 +2202,18 @@ different crypto accelerators. This option can be used to achieve best performance for particular HW. + indirect_target_selection= [X86,Intel] Mitigation control for Indirect + Target Selection(ITS) bug in Intel CPUs. Updated + microcode is also required for a fix in IBPB. + + on: Enable mitigation (default). + off: Disable mitigation. + force: Force the ITS bug and deploy default + mitigation. + + For details see: + Documentation/admin-guide/hw-vuln/indirect-target-selection.rst + init= [KNL] Format: Run specified binary instead of /sbin/init as init @@ -3693,6 +3705,7 @@ expose users to several CPU vulnerabilities. Equivalent to: if nokaslr then kpti=0 [ARM64] gather_data_sampling=off [X86] + indirect_target_selection=off [X86] kvm.nx_huge_pages=off [X86] l1tf=off [X86] mds=off [X86] diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 362602b705cc..417596ce5352 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -49,6 +49,7 @@ static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); static void __init gds_select_mitigation(void); +static void __init its_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -66,6 +67,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { @@ -178,6 +187,7 @@ void __init cpu_select_mitigations(void) */ srso_select_mitigation(); gds_select_mitigation(); + its_select_mitigation(); } /* @@ -1118,7 +1128,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1153,7 +1163,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1187,6 +1197,115 @@ do_cmd_auto: pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +enum its_mitigation_cmd { + ITS_CMD_OFF, + ITS_CMD_ON, +}; + +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS; + +static enum its_mitigation_cmd its_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_cmd = ITS_CMD_OFF; + } else if (!strcmp(str, "on")) { + its_cmd = ITS_CMD_ON; + } else if (!strcmp(str, "force")) { + its_cmd = ITS_CMD_ON; + setup_force_cpu_bug(X86_BUG_ITS); + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + enum its_mitigation_cmd cmd = its_cmd; + + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + /* Retpoline+CDT mitigates ITS, bail out */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + boot_cpu_has(X86_FEATURE_CALL_DEPTH)) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + goto out; + } + + /* Exit early to avoid irrelevant warnings */ + if (cmd == ITS_CMD_OFF) { + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (spectre_v2_enabled == SPECTRE_V2_NONE) { + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + + switch (cmd) { + case ITS_CMD_OFF: + its_mitigation = ITS_MITIGATION_OFF; + break; + case ITS_CMD_ON: + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + } +out: + pr_info("%s\n", its_strings[its_mitigation]); +} + #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt @@ -2607,10 +2726,10 @@ static void __init srso_select_mitigation(void) if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; + set_return_thunk(srso_alias_return_thunk); } else { setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; + set_return_thunk(srso_return_thunk); } if (has_microcode) srso_mitigation = SRSO_MITIGATION_SAFE_RET; @@ -2800,6 +2919,11 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2982,6 +3106,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3061,6 +3188,11 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index a7e511849875..50651435577c 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -600,6 +600,7 @@ CPU_SHOW_VULN_FALLBACK(spec_rstack_overflow); CPU_SHOW_VULN_FALLBACK(gds); CPU_SHOW_VULN_FALLBACK(reg_file_data_sampling); CPU_SHOW_VULN_FALLBACK(ghostwrite); +CPU_SHOW_VULN_FALLBACK(indirect_target_selection); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -616,6 +617,7 @@ static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NU static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL); static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling, NULL); static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); +static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -633,6 +635,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_gather_data_sampling.attr, &dev_attr_reg_file_data_sampling.attr, &dev_attr_ghostwrite.attr, + &dev_attr_indirect_target_selection.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e3049543008b..3aa955102b34 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,6 +78,8 @@ extern ssize_t cpu_show_gds(struct device *dev, extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_indirect_target_selection(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3