author     Ming Lei <ming.lei@redhat.com>  2025-08-30 10:18:21 +0800
committer  Jens Axboe <axboe@kernel.dk>    2025-09-08 08:05:32 -0600
commit     ad0d05dbddc1bf86e92220fea873176de6b12f78
tree       11bfd553a466ae9234b5ee53c3f228710665e765 /block
parent     blk-mq: Pass tag_set to blk_mq_free_rq_map/tags
blk-mq: Defer freeing of tags page_list to SRCU callback
Tag iterators can race with the freeing of the request pages (tags->page_list), potentially leading to use-after-free issues. Defer the freeing of the page list and the tags structure itself until after an SRCU grace period has passed. This ensures that any concurrent tag iterators have completed before the memory is released, and it lets us replace the big tags->lock in the tag iterator code path with SRCU to solve the issue.

This is achieved by:

- Adding a new `srcu_struct tags_srcu` to `blk_mq_tag_set` to protect tag map iteration.
- Adding an `rcu_head` to `struct blk_mq_tags` for use with `call_srcu`.
- Moving the page list freeing logic and the `kfree(tags)` call into a new callback function, `blk_mq_free_tags_callback`.
- In `blk_mq_free_tags`, invoking `call_srcu` to schedule the new callback for deferred execution.

The read-side protection for the tag iterators will be added in a subsequent patch.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
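For readers unfamiliar with the pattern, this is the standard SRCU deferred-free idiom: the writer hands the object to call_srcu() instead of freeing it directly, and the callback runs only after every reader that could have observed the object has left its SRCU read-side critical section. A minimal, self-contained sketch of the idiom follows; the demo_* names are hypothetical, and only the identifiers in the diff below (tags_srcu, rcu_head, blk_mq_free_tags_callback) correspond to real kernel code.

#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>

struct demo_tags {
	struct rcu_head rcu_head;	/* queued to call_srcu() on free */
	/* ... tag bitmaps, page_list, ... */
};

/* Initialized with init_srcu_struct(&demo_srcu) at set-up time. */
static struct srcu_struct demo_srcu;

/* Invoked only after every in-flight SRCU reader has finished. */
static void demo_free_cb(struct rcu_head *head)
{
	struct demo_tags *tags = container_of(head, struct demo_tags,
					      rcu_head);

	kfree(tags);	/* safe: no iterator can still reference it */
}

static void demo_free_tags(struct demo_tags *tags)
{
	/* Writer side: defer the real free past the grace period. */
	call_srcu(&demo_srcu, &tags->rcu_head, demo_free_cb);
}

/* Reader side (the subsequent patch mentioned above adds this). */
static void demo_iterate_tags(struct demo_tags *tags)
{
	int idx = srcu_read_lock(&demo_srcu);

	/* ... walk tags; memory cannot be freed until the unlock ... */
	srcu_read_unlock(&demo_srcu, idx);
}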
Diffstat (limited to 'block')
-rw-r--r--  block/blk-mq-tag.c | 24
-rw-r--r--  block/blk-mq.c     | 26
2 files changed, 36 insertions(+), 14 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index f09a4cbe486f..3c2ec6e86d54 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -8,6 +8,9 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/kmemleak.h>
#include <linux/delay.h>
#include "blk.h"
@@ -576,11 +579,30 @@ out_free_tags:
return NULL;
}
+static void blk_mq_free_tags_callback(struct rcu_head *head)
+{
+ struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags,
+ rcu_head);
+ struct page *page;
+
+ while (!list_empty(&tags->page_list)) {
+ page = list_first_entry(&tags->page_list, struct page, lru);
+ list_del_init(&page->lru);
+ /*
+ * Remove kmemleak object previously allocated in
+ * blk_mq_alloc_rqs().
+ */
+ kmemleak_free(page_address(page));
+ __free_pages(page, page->private);
+ }
+ kfree(tags);
+}
+
void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
{
sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(&tags->breserved_tags);
- kfree(tags);
+ call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback);
}
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5efa0712aac7..e1b44173029c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3454,7 +3454,6 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags;
- struct page *page;
if (list_empty(&tags->page_list))
return;
@@ -3478,17 +3477,10 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
blk_mq_clear_rq_mapping(drv_tags, tags);
-
- while (!list_empty(&tags->page_list)) {
- page = list_first_entry(&tags->page_list, struct page, lru);
- list_del_init(&page->lru);
- /*
- * Remove kmemleak object previously allocated in
- * blk_mq_alloc_rqs().
- */
- kmemleak_free(page_address(page));
- __free_pages(page, page->private);
- }
+ /*
+ * Free request pages in SRCU callback, which is called from
+ * blk_mq_free_tags().
+ */
}
void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
@@ -4834,6 +4826,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_srcu;
}
+ ret = init_srcu_struct(&set->tags_srcu);
+ if (ret)
+ goto out_cleanup_srcu;
init_rwsem(&set->update_nr_hwq_lock);
@@ -4842,7 +4837,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
- goto out_cleanup_srcu;
+ goto out_cleanup_tags_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4871,6 +4866,8 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_tags_srcu:
+ cleanup_srcu_struct(&set->tags_srcu);
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
@@ -4916,6 +4913,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+
+ srcu_barrier(&set->tags_srcu);
+ cleanup_srcu_struct(&set->tags_srcu);
if (set->flags & BLK_MQ_F_BLOCKING) {
cleanup_srcu_struct(set->srcu);
kfree(set->srcu);
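One ordering detail worth noting in the final hunk: cleanup_srcu_struct() must not run while callbacks queued by call_srcu() are still pending, so blk_mq_free_tag_set() calls srcu_barrier() first to wait for every outstanding blk_mq_free_tags_callback() to complete. In terms of the hypothetical demo_* sketch above:

static void demo_teardown(void)
{
	/* Flush all callbacks queued via call_srcu() ... */
	srcu_barrier(&demo_srcu);
	/* ... then tearing down the srcu_struct is safe. */
	cleanup_srcu_struct(&demo_srcu);
}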