From 0b91d06aeade41f83d44e5642b51ae39377a97ba Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 15:04:40 +0200 Subject: [PATCH 01/47] [RFC] fuse: Set request unique on allocation This is especially needed for better ftrace analysis, for example to build histograms. So far the request unique was missing, because it was added after the first trace message. IDs/req-unique now might not come up perfectly sequentially anymore, but especially with cloned device or io-uring this did not work perfectly anyway. Signed-off-by: Bernd Schubert (imported from commit 44158921dd11a6654a1935dfb2ad8af953f7ada1) --- fs/fuse/dev.c | 8 +++----- fs/fuse/dev_uring.c | 3 --- fs/fuse/virtio_fs.c | 3 --- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 612d4da6d7d914..07d3de0e6f1083 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -374,8 +374,6 @@ static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -639,6 +637,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(&req->fm->fc->iq); } ssize_t __fuse_simple_request(struct mnt_idmap *idmap, @@ -686,9 +687,6 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, static bool fuse_request_queue_background_uring(struct fuse_conn *fc, struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; - - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); diff --git a/fs/fuse/dev_uring.c 
b/fs/fuse/dev_uring.c index 249b210becb1cc..961d7963157407 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1268,9 +1268,6 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - spin_lock(&queue->lock); err = -ENOTCONN; if (unlikely(queue->stopped)) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 38051e5fba19ba..fc479e9aef9599 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1482,9 +1482,6 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - clear_bit(FR_PENDING, &req->flags); fs = fiq->priv; From dbdeb21073cef69d29858b5db30675c9ab8e7b2b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 16:38:44 +0200 Subject: [PATCH 02/47] fuse: {io-uring} Avoid _send code dup fuse_uring_send_next_to_ring() can just call into fuse_uring_send and avoid code dup. 
Signed-off-by: Bernd Schubert (imported from commit 9efaa8dfc77901a8b0cf24f65de5963ef23f170e) --- fs/fuse/dev_uring.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 961d7963157407..9129e72e4fa167 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -709,6 +709,20 @@ static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, return err; } +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move_tail(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + /* * Write data to the ring buffer and send the request to userspace, * userspace will read it @@ -718,22 +732,13 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req, unsigned int issue_flags) { - struct fuse_ring_queue *queue = ent->queue; int err; - struct io_uring_cmd *cmd; err = fuse_uring_prepare_send(ent, req); if (err) return err; - spin_lock(&queue->lock); - cmd = ent->cmd; - ent->cmd = NULL; - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, 0, 0, issue_flags); + fuse_uring_send(ent, ent->cmd, 0, issue_flags); return 0; } @@ -1189,20 +1194,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; } -static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, - ssize_t ret, unsigned int issue_flags) -{ - struct fuse_ring_queue *queue = ent->queue; - - spin_lock(&queue->lock); - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - ent->cmd = NULL; - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, 
ret, 0, issue_flags); -} - /* * This prepares and sends the ring request in fuse-uring task context. * User buffers are not mapped yet - the application does not have permission From e3a1dbc6cb1db91296f6e5fb4ff5800b7670b930 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 16:17:23 +0200 Subject: [PATCH 03/47] fuse: fine-grained request ftraces Rename trace_fuse_request_send to trace_fuse_request_enqueue Add trace_fuse_request_send Add trace_fuse_request_bg_enqueue Add trace_fuse_request_enqueue This helps to track entire request time and time in different queues. Signed-off-by: Bernd Schubert (imported from commit 4a7f14274fc223e50b36f428e1b6acd661b73f53) --- fs/fuse/dev.c | 7 +++++- fs/fuse/dev_uring.c | 2 ++ fs/fuse/fuse_trace.h | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 07d3de0e6f1083..171b9c57b841db 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -396,7 +396,9 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); + + /* enqueue, as it is send to "fiq->ops queue" */ + trace_fuse_request_enqueue(req); fiq->ops->send_req(fiq, req); } @@ -711,6 +713,8 @@ static int fuse_request_queue_background(struct fuse_req *req) } __set_bit(FR_ISREPLY, &req->flags); + trace_fuse_request_bg_enqueue(req); + #ifdef CONFIG_FUSE_IO_URING if (fuse_uring_ready(fc)) return fuse_request_queue_background_uring(fc, req); @@ -1446,6 +1450,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); + trace_fuse_request_send(req); args = req->args; reqsize = req->in.h.len; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9129e72e4fa167..477ef8749fe991 100644 --- 
a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,6 +7,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include #include @@ -720,6 +721,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); + trace_fuse_request_send(ent->fuse_req); io_uring_cmd_done(cmd, ret, 0, issue_flags); } diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index bbe9ddd8c71696..393c630e772635 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -77,30 +77,55 @@ OPCODES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -TRACE_EVENT(fuse_request_send, +#define FUSE_REQ_TRACE_FIELDS \ + __field(dev_t, connection) \ + __field(uint64_t, unique) \ + __field(enum fuse_opcode, opcode) \ + __field(uint32_t, len) \ + +#define FUSE_REQ_TRACE_ASSIGN(req) \ + do { \ + __entry->connection = req->fm->fc->dev; \ + __entry->unique = req->in.h.unique; \ + __entry->opcode = req->in.h.opcode; \ + __entry->len = req->in.h.len; \ + } while (0) + + +TRACE_EVENT(fuse_request_enqueue, TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_bg_enqueue, + TP_PROTO(const struct fuse_req *req), TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), - TP_STRUCT__entry( - __field(dev_t, connection) - __field(uint64_t, unique) - __field(enum fuse_opcode, opcode) - __field(uint32_t, len) - ), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); - TP_fast_assign( - __entry->connection = req->fm->fc->dev; - 
__entry->unique = req->in.h.unique; - __entry->opcode = req->in.h.opcode; - __entry->len = req->in.h.len; - ), +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), TP_printk("connection %u req %llu opcode %u (%s) len %u ", __entry->connection, __entry->unique, __entry->opcode, __print_symbolic(__entry->opcode, OPCODES), __entry->len) ); + TRACE_EVENT(fuse_request_end, TP_PROTO(const struct fuse_req *req), From 3bfca20d38bef404a44c0df6626ec5f4c1eb3aba Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 8 Jan 2025 16:10:27 +0100 Subject: [PATCH 04/47] fuse: {uring} Pin the user buffer This is to allow copying into the buffer from the application without the need to copy in ring context (and with that, the need that the ring task is active in kernel space). Signed-off-by: Bernd Schubert (cherry picked from commit 43d1a63dec17d928609fb9725ac4ab9d6e09803f) (imported from commit ea01f94a55f91fa48cb3a0304b1e41a92707539a) --- fs/fuse/dev.c | 9 ++ fs/fuse/dev_uring.c | 229 +++++++++++++++++++++++++++++++++++++++--- fs/fuse/dev_uring_i.h | 4 + fs/fuse/fuse_dev_i.h | 2 + 4 files changed, 228 insertions(+), 16 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 171b9c57b841db..350f11334668db 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -895,6 +895,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs++; } + } else if (cs->ring.pages) { + cs->pg = cs->ring.pages[cs->ring.page_idx++]; + /* + * non stricly needed, just to avoid a uring exception in + * fuse_copy_finish + */ + get_page(cs->pg); + cs->len = PAGE_SIZE; + cs->offset = 0; } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 477ef8749fe991..546808dbd546a4 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -11,6 +11,7 @@ #include #include 
+#include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); @@ -18,7 +19,23 @@ MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_RING_HEADER_PG 0 +#define FUSE_RING_PAYLOAD_PG 1 +/* redfs only to allow patch backports */ +#define IO_URING_F_TASK_DEAD (1 << 13) + +#ifndef io_uring_cmd_to_pdu +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +/* red specific backport */ +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) +#endif bool fuse_uring_enabled(void) { @@ -184,6 +201,21 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +/* + * Copy from memmap.c, should be exported + */ +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -207,6 +239,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, + ent->nr_payload_pages); kfree(ent); } @@ -597,13 +632,67 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, fuse_copy_init(&cs, false, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); } - /* - * Copy data from the req to the ring buffer - */ +/* + * Copy data from the req to the ring buffer + * In order to be able to write into the ring buffer from the application, + * i.e. 
to avoid io_uring_cmd_complete_in_task(), the header needs to be + * pinned as well. + */ +static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent, + struct fuse_uring_req_header *headers) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + fuse_copy_init(&cs, 1, NULL); + cs.is_uring = 1; + cs.req = req; + cs.ring.pages = ent->payload_pages; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + memcpy(&headers->op_in, in_args->value, in_args->size); + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + return err; +} + +/* + * Copy data from the req to the ring buffer + */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { @@ -627,6 +716,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_init(&cs, true, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; if (num_args > 0) { /* @@ -666,6 +757,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; + struct fuse_uring_req_header *headers = NULL; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -678,22 +770,29 @@ static int fuse_uring_copy_to_ring(struct 
fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; - /* copy the request */ - err = fuse_uring_args_to_ring(ring, req, ent); - if (unlikely(err)) { - pr_info_ratelimited("Copy to ring failed: %d\n", err); - return err; - } - /* copy fuse_in_header */ - err = copy_to_user(&ent->headers->in_out, &req->in.h, - sizeof(req->in.h)); - if (err) { - err = -EFAULT; - return err; + if (ent->header_pages) { + headers = kmap_local_page( + ent->header_pages[FUSE_RING_HEADER_PG]); + + memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); + kunmap_local(headers); + } else { + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) + err = -EFAULT; } - return 0; + return err; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, @@ -1007,6 +1106,45 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, } } +/* + * Copy from memmap.c, should be exported there + */ +static struct page **io_pin_pages(unsigned long uaddr, unsigned long len, + int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + 
kvfree(pages); + return ERR_PTR(ret); +} + /* * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] * the payload @@ -1033,6 +1171,59 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, return 0; } +static int fuse_uring_pin_pages(struct fuse_ring_ent *ent) +{ + struct fuse_ring *ring = ent->queue->ring; + int err; + + /* + * This needs to do locked memory accounting, for now privileged servers + * only. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* Pin header pages */ + if (!PAGE_ALIGNED(ent->headers)) { + pr_info_ratelimited("ent->headers is not page-aligned: %p\n", + ent->headers); + return -EINVAL; + } + + ent->header_pages = io_pin_pages((unsigned long)ent->headers, + sizeof(struct fuse_uring_req_header), + &ent->nr_header_pages); + if (IS_ERR(ent->header_pages)) { + err = PTR_ERR(ent->header_pages); + pr_info_ratelimited("Failed to pin header pages, err=%d\n", + err); + ent->header_pages = NULL; + return err; + } + + if (ent->nr_header_pages != 1) { + pr_info_ratelimited("Header pages not pinned as one page\n"); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->header_pages = NULL; + return -EINVAL; + } + + /* Pin payload pages */ + ent->payload_pages = io_pin_pages((unsigned long)ent->payload, + ring->max_payload_sz, + &ent->nr_payload_pages); + if (IS_ERR(ent->payload_pages)) { + err = PTR_ERR(ent->payload_pages); + pr_info_ratelimited("Failed to pin payload pages, err=%d\n", + err); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->payload_pages = NULL; + return err; + } + + return 0; +} + static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) @@ -1074,6 +1265,12 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + err = fuse_uring_pin_pages(ent); + if (err) { + kfree(ent); + return ERR_PTR(err); + } + atomic_inc(&ring->queue_refs); return ent; } 
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 51a563922ce141..c89c7dc27c76c1 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -40,7 +40,11 @@ enum fuse_ring_req_state { struct fuse_ring_ent { /* userspace buffer */ struct fuse_uring_req_header __user *headers; + struct page **header_pages; + int nr_header_pages; void __user *payload; + struct page **payload_pages; + int nr_payload_pages; /* the ring queue that owns the request */ struct fuse_ring_queue *queue; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a3193d..1e91079947f82b 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -34,6 +34,8 @@ struct fuse_copy_state { bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ + struct page **pages; + int page_idx; } ring; }; From c211946c4c90f86651dc2a57112d3d2bb5b3a9cd Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 17 Jan 2025 22:06:30 +0100 Subject: [PATCH 05/47] fuse: {io-uring} Avoid complete-in-task if pinned pages are used If pinned pages are used the application can write into these pages and io_uring_cmd_complete_in_task() is not needed. 
Signed-off-by: Bernd Schubert (imported from commit 5f0264c1dc0100e274f3db37511bba0d8043de1c) --- fs/fuse/dev_uring.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 546808dbd546a4..544decbc123c64 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1436,12 +1436,31 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) return queue; } -static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) { struct io_uring_cmd *cmd = ent->cmd; - uring_cmd_set_ring_ent(cmd, ent); - io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + /* + * Task needed when pages are not pinned as the application doing IO + * is not allowed to write into fuse-server pages. + * Additionally for IO through io-uring as issue flags are unknown then. + * backgrounds requests might hold spin-locks, that conflict with + * io_uring_cmd_done() mutex lock. 
+ */ + if (!ent->header_pages || current->io_uring || bg) { + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + } else { + int err = fuse_uring_prepare_send(ent, ent->fuse_req); + struct fuse_ring_queue *queue = ent->queue; + + if (err) { + fuse_uring_next_fuse_req(ent, queue, + IO_URING_F_UNLOCKED); + return; + } + fuse_uring_send(ent, cmd, 0, IO_URING_F_UNLOCKED); + } } /* queue a fuse request and send it if a ring entry is available */ @@ -1474,7 +1493,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) spin_unlock(&queue->lock); if (ent) - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, false); return; @@ -1527,7 +1546,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, true); } else { spin_unlock(&queue->lock); } From 358d6742a44b9f92ebcf58a1b0cc27945a2906c0 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 7 May 2025 23:39:00 +0200 Subject: [PATCH 06/47] fuse: Use fuse-server provided read-ahead for CAP_SYS_ADMIN Readahead is currently limited to bdi->ra_pages. One can change that after the mount with something like minor=$(stat -c "%d" /path/to/fuse) echo 1024 > /sys/class/bdi/0:$minor/read_ahead_kb Issue is that fuse-server cannot do that from its ->init method, as it has to know about device minor, which blocks before init is complete. Fuse already sets the bdi value, but upper limit is the current bdi value. For CAP_SYS_ADMIN we can allow higher values. 
Signed-off-by: Bernd Schubert (imported from commit 763c96da4bd6d1bb95d8e6bb7fd352389f3a17b9) --- fs/fuse/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7c0403a002e759..0a00b1c1b44018 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1448,7 +1448,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, init_server_timeout(fc, timeout); - fm->sb->s_bdi->ra_pages = + if (capable(CAP_SYS_ADMIN)) + fm->sb->s_bdi->ra_pages = ra_pages; + else + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; From eb4e2d4bc74da1940f959bca16e57dbc3a2e01a2 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 8 Apr 2025 16:44:55 +0200 Subject: [PATCH 07/47] fuse: Increase the default max pages limit to 8182 Due to user buffer misalignment we actually need one page more, i.e. 1025 instead of 1024, will be handled differently. For now we just bump up the max. (imported from commit 3f71501c9c4702ba976145ff15c4a053ecd1a3ee) --- fs/fuse/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0a00b1c1b44018..b4d724336f2719 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -37,7 +37,8 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned int fuse_max_pages_limit = 256; +unsigned int fuse_max_pages_limit = 4097; + /* default is no timeout */ unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; From d4a733cc681f7d5674c059e9c52a80f6030abb59 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 20 Jun 2025 17:34:53 +0200 Subject: [PATCH 08/47] fuse: add DLM_LOCK opcode When having writeback cache enabled it is beneficial for data consistency to communicate to the FUSE server when the kernel prepares a page for caching. 
This lets the FUSE server react and lock the page. Additionally the kernel lets the FUSE server decide how much data it locks by the same call and keeps the given information in the dlm lock management. If the feature is not supported it will be disabled after first unsuccessful use. - Add DLM_LOCK fuse opcode - Add cache page lock caching for writeback cache functionality. This means sending out a FUSE call whenever the kernel prepares a page for writeback cache. The kernel will manage the cache so that it will keep track of already acquired locks. (except for the case that is documented in the code) - Use rb-trees for the management of the already 'locked' page ranges - Use rw_semaphore for synchronization in fuse_dlm_cache (imported from commit 287c8840b60d5cdcf806b16e8cc5722f2dbf0738) --- fs/fuse/Makefile | 2 +- fs/fuse/dir.c | 6 + fs/fuse/file.c | 25 +- fs/fuse/fuse_dlm_cache.c | 551 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_dlm_cache.h | 50 ++++ fs/fuse/fuse_i.h | 18 ++ fs/fuse/fuse_trace.h | 1 + fs/fuse/inode.c | 11 + include/uapi/linux/fuse.h | 36 +++ 9 files changed, 698 insertions(+), 2 deletions(-) create mode 100644 fs/fuse/fuse_dlm_cache.c create mode 100644 fs/fuse/fuse_dlm_cache.h diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3f0f312a31c1cc..d55e0e622e123b 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5c569c3cb53f3d..f678659bf44a04 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -6,6 +6,7 @@ See the file COPYING. 
*/ +#include "fuse_dlm_cache.h" #include "fuse_i.h" #include @@ -1989,6 +1990,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * truncation has already been done by OPEN. But still * need to truncate page cache. */ + if (fc->dlm && fc->writeback_cache) + fuse_dlm_cache_release_locks(fi); i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; @@ -2094,6 +2097,9 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (fc->dlm && fc->writeback_cache) + fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a52cf1b9cfc650..f968dde76c56b3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -1434,6 +1435,27 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!fc->handle_killpriv_v2 || !setattr_should_drop_suidgid(idmap, file_inode(file))) writeback = true; + + /* + * If we have dlm support acquire the lock for the area + * we are writing into. + */ + if (fc->dlm) { + /* + * Note that a file opened with O_APPEND will have + * relative values in ki_pos. This code is here for + * convenience and for libfuse overlay test. + * Filesystems should handle O_APPEND with 'direct io' + * to additionally get the performance benefits of + * 'parallel direct writes'. + */ + loff_t pos = file->f_flags & O_APPEND ? 
+ i_size_read(inode) + iocb->ki_pos : + iocb->ki_pos; + size_t length = iov_iter_count(from); + + fuse_get_dlm_write_lock(file, pos, length); + } } inode_lock(inode); @@ -1453,7 +1475,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from)) goto out; written = direct_write_fallback(iocb, from, written, - fuse_perform_write(iocb, from)); + fuse_perform_write(iocb, from)); } else if (writeback) { /* * Use iomap so that we can do granular uptodate reads @@ -3119,6 +3141,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); + fuse_dlm_cache_init(fi); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c new file mode 100644 index 00000000000000..ea947f34a9f70a --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FUSE page lock cache implementation + */ +#include "fuse_i.h" +#include "fuse_dlm_cache.h" + +#include +#include +#include +#include + + +/* A range of pages with a lock */ +struct fuse_dlm_range { + /* Interval tree node */ + struct rb_node rb; + /* Start page offset (inclusive) */ + pgoff_t start; + /* End page offset (inclusive) */ + pgoff_t end; + /* Subtree end value for interval tree */ + pgoff_t __subtree_end; + /* Lock mode */ + enum fuse_page_lock_mode mode; + /* Temporary list entry for operations */ + struct list_head list; +}; + +/* Lock modes for FUSE page cache */ +#define FUSE_PCACHE_LK_READ 1 /* Shared read lock */ +#define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ + +/* Interval tree definitions for page ranges */ +static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +{ + return range->start; +} + +static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +{ + return range->end; +} + 
+INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); + +/** + * fuse_page_cache_init - Initialize a page cache lock manager + * @cache: The cache to initialize + * + * Initialize a page cache lock manager for a FUSE inode. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_cache_init(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + + if (!cache) + return -EINVAL; + + init_rwsem(&cache->lock); + cache->ranges = RB_ROOT_CACHED; + + return 0; +} + +/** + * fuse_page_cache_destroy - Clean up a page cache lock manager + * @cache: The cache to clean up + * + * Release all locks and free all resources associated with the cache. + */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + struct rb_node *node; + + if (!cache) + return; + + /* Release all locks */ + down_write(&cache->lock); + while ((node = rb_first_cached(&cache->ranges)) != NULL) { + range = rb_entry(node, struct fuse_dlm_range, rb); + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + up_write(&cache->lock); +} + +/** + * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end] + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Return: Pointer to the first overlapping range, or NULL if none found + */ +static struct fuse_dlm_range * +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + return fuse_page_it_iter_first(&cache->ranges, start, end); +} + +/** + * fuse_page_try_merge - Try to merge ranges within a specific region + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Attempt to merge ranges within and adjacent to the specified region + * that have the same lock mode. 
+ */ +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *next; + struct rb_node *node; + + if (!cache) + return; + + /* Find the first range that might need merging */ + range = NULL; + node = rb_first_cached(&cache->ranges); + while (node) { + range = rb_entry(node, struct fuse_dlm_range, rb); + if (range->end >= start - 1) + break; + node = rb_next(node); + } + + if (!range || range->start > end + 1) + return; + + /* Try to merge ranges in and around the specified region */ + while (range && range->start <= end + 1) { + /* Get next range before we potentially modify the tree */ + next = NULL; + if (rb_next(&range->rb)) { + next = rb_entry(rb_next(&range->rb), + struct fuse_dlm_range, rb); + } + + /* Try to merge with next range if adjacent and same mode */ + if (next && range->mode == next->mode && + range->end + 1 == next->start) { + /* Merge ranges */ + range->end = next->end; + + /* Remove next from tree */ + fuse_page_it_remove(next, &cache->ranges); + kfree(next); + + /* Continue with the same range */ + continue; + } + + /* Move to next range */ + range = next; + } +} + +/** + * fuse_dlm_lock_range - Lock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode (read or write) + * + * Add a locked range on the specified range of pages. + * If parts of the range are already locked, only add the remaining parts. 
+ * For overlapping ranges, handle lock compatibility: + * - READ locks are compatible with existing READ locks + * - READ locks are compatible with existing WRITE locks (downgrade not needed) + * - WRITE locks need to upgrade existing READ locks + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *new_range, *next; + int lock_mode; + int ret = 0; + LIST_HEAD(to_lock); + LIST_HEAD(to_upgrade); + pgoff_t current_start = start; + + if (!cache || start > end) + return -EINVAL; + + /* Convert to lock mode */ + lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. 
*/ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but 
failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. + * + * Return: 0 on success, negative error code on failure + */ +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *new_range; + int ret = 0; + + if (!cache || start > end) + return -EINVAL; + + /* Find a range that contains [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + if (!range) { + ret = -EINVAL; + goto out; + } + + /* If the hole is at the beginning of the range */ + if (start == range->start) { + range->start = end + 1; + goto out; + } + + /* If the hole is at the end of the range */ + if (end == range->end) { + range->end = start - 1; + goto out; + } + + /* The hole is in the middle, need to split */ + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out; + } + + /* Copy properties from original range */ + *new_range = *range; + INIT_LIST_HEAD(&new_range->list); + + /* Adjust ranges */ + new_range->start = end + 1; + range->end = start - 1; + + /* Update interval tree */ + fuse_page_it_remove(range, &cache->ranges); + fuse_page_it_insert(range, &cache->ranges); + fuse_page_it_insert(new_range, &cache->ranges); + +out: + return ret; +} + +/** + * fuse_dlm_unlock_range - Unlock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Release locks on the specified range of pages. 
+ * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, + pgoff_t start, pgoff_t end) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *next; + int ret = 0; + + if (!cache) + return -EINVAL; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check if we need to punch a hole */ + if (start > range->start && end < range->end) { + /* Punch a hole in the middle */ + ret = fuse_dlm_punch_hole(cache, start, end); + if (ret) + goto out; + /* After punching a hole, we're done */ + break; + } else if (start > range->start) { + /* Adjust the end of the range */ + range->end = start - 1; + } else if (end < range->end) { + /* Adjust the start of the range */ + range->start = end + 1; + } else { + /* Complete overlap, remove the range */ + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + + range = next; + } + +out: + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_range_is_locked - Check if a page range is already locked + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode to check for (or NULL to check for any lock) + * + * Check if the specified range of pages is already locked. + * The entire range must be locked for this to return true. 
+ * + * Return: true if the entire range is locked, false otherwise + */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + int lock_mode = 0; + pgoff_t current_start = start; + + if (!cache || start > end) + return false; + + /* Convert to lock mode if specified */ + if (mode == FUSE_PAGE_LOCK_READ) + lock_mode = FUSE_PCACHE_LK_READ; + else if (mode == FUSE_PAGE_LOCK_WRITE) + lock_mode = FUSE_PCACHE_LK_WRITE; + + down_read(&cache->lock); + + /* Find the first range that overlaps with [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + + /* Check if the entire range is covered */ + while (range && current_start <= end) { + /* If we're checking for a specific mode, verify it matches */ + if (lock_mode && range->mode != lock_mode) { + /* Wrong lock mode */ + up_read(&cache->lock); + return false; + } + + /* Check if there's a gap before this range */ + if (current_start < range->start) { + /* Found a gap */ + up_read(&cache->lock); + return false; + } + + /* Move current_start past this range */ + current_start = range->end + 1; + + /* Get next overlapping range */ + range = fuse_page_it_iter_next(range, start, end); + } + + /* Check if we covered the entire range */ + if (current_start <= end) { + /* There's a gap at the end */ + up_read(&cache->lock); + return false; + } + + up_read(&cache->lock); + return true; +} + +/** + * request a dlm lock from the fuse server + */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + + /* note that the offset and length don't have to be page 
aligned here + * but since we only get here on writeback caching we will send out + * page aligned requests */ + offset &= PAGE_MASK; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + /* note that this can be run from different processes + * at the same time. It is intentionally not protected + * since a DLM implementation in the FUSE server should take care + * of any races in lock requests */ + if (fuse_dlm_range_is_locked(fi, offset, + end, FUSE_PAGE_LOCK_WRITE)) + return; /* we already have this area locked */ + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = end - offset + 1; + inarg.type = FUSE_DLM_LOCK_WRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + /* fuse server does not support dlm, save the info */ + fc->dlm = 0; + return; + } + + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } + + if (err) + return; + else + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, + offset + outarg.locksize - 1, + FUSE_PAGE_LOCK_WRITE); +} diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h new file mode 100644 index 00000000000000..98b27a2c15d8ba --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * FUSE page cache lock implementation + */ + +#ifndef _FS_FUSE_DLM_CACHE_H +#define _FS_FUSE_DLM_CACHE_H + +#include +#include +#include +#include + + +struct fuse_inode; + +/* Lock 
modes for page ranges */ +enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE }; + +/* Page cache lock manager */ +struct fuse_dlm_cache { + /* Lock protecting the tree */ + struct rw_semaphore lock; + /* Interval tree of locked ranges */ + struct rb_root_cached ranges; +}; + +/* Initialize a page cache lock manager */ +int fuse_dlm_cache_init(struct fuse_inode *inode); + +/* Clean up a page cache lock manager */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode); + +/* Lock a range of pages */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* Unlock a range of pages */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end); + +/* Check if a page range is already locked */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* this is the interface to the filesystem */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length); + +#endif /* _FS_FUSE_DLM_CACHE_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e14..71d70f0037b165 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -31,6 +31,7 @@ #include #include #include +#include "fuse_dlm_cache.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -106,6 +107,17 @@ struct fuse_backing { struct rcu_head rcu; }; +/** + * data structure to save the information that we have + * requested dlm locks for the given area from the fuse server +*/ +struct dlm_locked_area +{ + struct list_head list; + loff_t offset; + size_t size; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -161,6 +173,9 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; + + /* dlm locked areas we have sent lock requests for */ + struct fuse_dlm_cache dlm_locked_areas; }; /* readdir cache 
(directory only) */ @@ -892,6 +907,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* do we have support for dlm in the fs? */ + unsigned int dlm:1; + /** Passthrough support for read/write IO */ unsigned int passthrough:1; diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 393c630e772635..9976e31a51a9c9 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,6 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b4d724336f2719..8b42d18c003a58 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include "dev_uring_i.h" #include @@ -194,6 +195,7 @@ static void fuse_evict_inode(struct inode *inode) WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); + fuse_dlm_cache_release_locks(fi); } } @@ -577,6 +579,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; + + if (fc->dlm && fc->writeback_cache) + /* invalidate the range from the beginning of the first page + * in the given range to the last byte of the last page */ + fuse_dlm_unlock_range(fi, + pg_start << PAGE_SHIFT, + (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -978,6 +988,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->blocked = 0; fc->initialized = 0; fc->connected = 1; + fc->dlm = 1; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4da..d9da5cf6372993 100644 --- a/include/uapi/linux/fuse.h +++ 
b/include/uapi/linux/fuse.h @@ -657,6 +657,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_DLM_WB_LOCK = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1227,6 +1228,41 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Type of the dlm lock requested + */ +enum fuse_dlm_lock_type { + FUSE_DLM_LOCK_NONE = 0, + FUSE_DLM_LOCK_READ = 1, + FUSE_DLM_LOCK_WRITE = 2 +}; + +/** + * struct fuse_dlm_lock_in - Lock request + * @fh: file handle + * @offset: offset into the file + * @size: size of the locked region + * @type: type of lock + */ +struct fuse_dlm_lock_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t type; + uint64_t reserved; +}; + +/** + * struct fuse_dlm_lock_out - Lock response + * @locksize: how many bytes were locked by the call + * (most of the time we want to lock more than is requested + * to reduce number of calls) + */ +struct fuse_dlm_lock_out { + uint32_t locksize; + uint32_t padding; +}; + /** * Size of the ring buffer header */ From a2e7c1ba712425b03c63ba4d000b65e4a451ef6e Mon Sep 17 00:00:00 2001 From: Yong Ze Chen Date: Tue, 8 Jul 2025 06:41:45 +0000 Subject: [PATCH 09/47] fuse: invalidate inode aliases when doing inode invalidation Add support to invalidate inode aliases when doing inode invalidation. This is useful for distributed file systems, which use DLM for cache coherency. So, when a client loses its inode lock, it should invalidate its inode cache and dentry cache since the other client may delete this file after getting the inode lock. 
Signed-off-by: Yong Ze Chen (imported from commit 49720b5c84ada61feeb09da9ad4b9a0a40694792) --- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/inode.c | 50 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 4 ++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 71d70f0037b165..881703571a2d68 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -773,6 +773,12 @@ struct fuse_conn { */ unsigned handle_killpriv_v2:1; + /* invalidate inode entries when doing inode invalidation */ + unsigned inval_inode_entries:1; + + /* expire inode entries when doing inode invalidation */ + unsigned expire_inode_entries:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8b42d18c003a58..1e4bb1575408fc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -554,6 +554,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, return NULL; } +static void fuse_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); +} + +static void fuse_invalidate_inode_entry(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + /* For directories, use d_invalidate to handle children and submounts */ + dentry = d_find_alias(inode); + if (dentry) { + d_invalidate(dentry); + fuse_invalidate_entry_cache(dentry); + dput(dentry); + } + } else { + /* For regular files, just unhash the dentry */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + } +} + int 
fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { @@ -571,6 +610,11 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); + if (fc->inval_inode_entries) + fuse_invalidate_inode_entry(inode); + else if (fc->expire_inode_entries) + fuse_prune_aliases(inode); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -1452,6 +1496,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if (flags & FUSE_REQUEST_TIMEOUT) timeout = arg->request_timeout; + if (flags & FUSE_INVAL_INODE_ENTRY) + fc->inval_inode_entries = 1; + if (flags & FUSE_EXPIRE_INODE_ENTRY) + fc->expire_inode_entries = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1505,7 +1553,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | - FUSE_REQUEST_TIMEOUT; + FUSE_REQUEST_TIMEOUT | FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d9da5cf6372993..c6241e578db18f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -443,6 +443,8 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
* init_out.request_timeout contains the timeout (in secs) + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -490,6 +492,8 @@ struct fuse_file_lock { #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) /** * CUSE INIT request/reply flags From f5987159b6b632addde7fda9acff4014edbfb552 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Thu, 17 Jul 2025 17:04:16 +0000 Subject: [PATCH 10/47] fuse: Renumber FUSE_DLM_WB_LOCK to 100 Renumber the operation code to a high value to avoid conflicts with upstream. (imported from commit 27a0e9ea714f7fcf3ee40f977be6a17c10766509) --- fs/fuse/fuse_trace.h | 2 +- include/uapi/linux/fuse.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 9976e31a51a9c9..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,7 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ - EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c6241e578db18f..cf9724e0a4e8a1 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -661,7 +661,9 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, - FUSE_DLM_WB_LOCK = 53, + + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1238,7 +1240,7 @@ struct fuse_supp_groups { enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, - 
FUSE_DLM_LOCK_WRITE = 2 + FUSE_DLM_LOCK_WRITE = 2, }; /** From c779a860b682f80a6be00331e34a21d33755d998 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:18:06 +0000 Subject: [PATCH 11/47] fuse: Send DLM_WB_LOCK request in page_mkwrite handler Send a DLM_WB_LOCK request in the page_mkwrite handler to enable FUSE filesystems to acquire a distributed lock manager (DLM) lock for protecting upcoming dirty pages when a previously read-only mapped page is about to be written. Signed-off-by: Cheng Ding (imported from commit ec36c455214837e9ce0d3f3385a0bb50dcfb51db) --- fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 1 + 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f968dde76c56b3..f2ecb86de55795 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2283,6 +2283,57 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. 
+ */ +static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = ff->fm; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK))) + return -EIO; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = length; + inarg.type = FUSE_DLM_PAGE_MKWRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->dlm = 0; + err = 0; + } + + if (!err && outarg.locksize < length) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", + length, outarg.locksize); + fuse_abort_conn(fc); + err = -EINVAL; + } + return err; +} /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly @@ -2301,7 +2352,18 @@ static void fuse_vma_close(struct vm_area_struct *vma) static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); - struct inode *inode = file_inode(vmf->vma->vm_file); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + + if (fm->fc->dlm) { + loff_t pos = vmf->pgoff << PAGE_SHIFT; + size_t length = PAGE_SIZE; + int err = fuse_get_page_mkwrite_lock(file, pos, length); + if (err < 0) { + return vmf_error(err); + } + } file_update_time(vmf->vma->vm_file); folio_lock(folio); diff --git 
a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index cf9724e0a4e8a1..b8da5cc6e159da 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1241,6 +1241,7 @@ enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, FUSE_DLM_LOCK_WRITE = 2, + FUSE_DLM_PAGE_MKWRITE = 3, }; /** From 1954b0e0280121b4a9f9cc51e13d6435d875d84e Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:20:08 +0000 Subject: [PATCH 12/47] fuse: Allow read_folio to retry page fault and read operations Allow read_folio to return an EAGAIN error and translate it to AOP_TRUNCATED_PAGE to retry page fault and read operations. This is used to prevent deadlock of folio lock/DLM lock order reversal: - Fault or read operations acquire the folio lock first, then the DLM lock. - The FUSE daemon blocks new DLM lock acquisition while it is invalidating the page cache. invalidate_inode_pages2_range() acquires the folio lock. To prevent deadlock, the FUSE daemon will fail its DLM lock acquisition with EAGAIN if it detects an in-flight page cache invalidating operation. Signed-off-by: Cheng Ding (imported from commit 8ecf1182053891c6458b10be1272d2d562492fbd) --- fs/fuse/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f2ecb86de55795..eda3b6866c3078 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -824,8 +824,11 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ From eaab38b4404bfbb4d2d5c8e04a91f63697f6a861 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 17 Jul 2025 16:26:51 -0700 Subject: [PATCH 13/47] fuse: flush pending fuse events before aborting the connection generic/488 fails with fuse2fs in the following fashion: generic/488 _check_generic_filesystem: filesystem on /dev/sdf is inconsistent (see /var/tmp/fstests/generic/488.full for details) This test opens a large number of files, unlinks them (which really just renames them to fuse hidden files), closes the program, unmounts the filesystem, and runs fsck to check that there aren't any inconsistencies in the filesystem. Unfortunately, the 488.full file shows that there are a lot of hidden files left over in the filesystem, with incorrect link counts. Tracing fuse_request_* shows that there are a large number of FUSE_RELEASE commands that are queued up on behalf of the unlinked files at the time that fuse_conn_destroy calls fuse_abort_conn. Had the connection not aborted, the fuse server would have responded to the RELEASE commands by removing the hidden files; instead they stick around. Create a function to push all the background requests to the queue and then wait for the number of pending events to hit zero, and call this before fuse_abort_conn. That way, all the pending events are processed by the fuse server and we don't end up with a corrupt filesystem. Signed-off-by: Darrick J. Wong (imported from commit d4262f9cf5232394d518207863d1ad79f52b179e) --- fs/fuse/dev.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 1 + 3 files changed, 45 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 350f11334668db..759da4a8a3afb1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -2398,6 +2399,43 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. 
+ */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. + */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 881703571a2d68..6d0197bd1a6f5f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1290,6 +1290,12 @@ void fuse_wait_aborted(struct fuse_conn *fc); /* Check if any requests timed out */ void fuse_check_timeout(struct work_struct *work); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. 
+ */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1e4bb1575408fc..9f52fa6bf5b239 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2116,6 +2116,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); From feb65a0e2adea9ef7d048f5ef87acb316611b22d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 17:24:42 +0200 Subject: [PATCH 14/47] fuse: Refactor io-uring bg queue flush and queue abort This is a preparation to allow fuse-io-uring bg queue flush from flush_bg_queue() This does two function renames: fuse_uring_flush_bg -> fuse_uring_flush_queue_bg fuse_uring_abort_end_requests -> fuse_uring_flush_bg And fuse_uring_abort_end_queue_requests() is moved to fuse_uring_stop_queues(). Signed-off-by: Bernd Schubert (imported from commit e70ef24251116bc7f591a9a856c371549cd5ae77) --- fs/fuse/dev_uring.c | 14 +++++++------- fs/fuse/dev_uring_i.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 544decbc123c64..dbc1a883b68b6f 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -65,7 +65,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -106,7 +106,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } @@ -135,11 +135,11 @@ static void fuse_uring_abort_end_queue_requests(struct 
fuse_ring_queue *queue) fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -151,10 +151,9 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); } } @@ -511,6 +510,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); fuse_uring_teardown_entries(queue); } @@ -1532,7 +1532,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); /* diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index c89c7dc27c76c1..ea86d4084e7676 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -142,7 +142,7 @@ struct fuse_ring { bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); -void fuse_uring_abort_end_requests(struct fuse_ring *ring); +void fuse_uring_flush_bg(struct fuse_conn *fc); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); @@ -157,7 +157,7 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) return; if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_abort_end_requests(ring); + fuse_uring_flush_bg(fc); 
fuse_uring_stop_queues(ring); } } From b48d099c85bc3005e2fac0d8713b767678c20078 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 18:24:41 +0200 Subject: [PATCH 15/47] fuse: Flush the io-uring bg queue from fuse_uring_flush_bg This is useful to have a unique API to flush background requests. For example when the bg queue gets flushed before the remaining of fuse_conn_destroy(). Signed-off-by: Bernd Schubert (imported from commit fc4120cc58e7fbcb541bf2e9a72781b569561912) --- fs/fuse/dev.c | 2 ++ fs/fuse/dev_uring.c | 3 +++ fs/fuse/dev_uring_i.h | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 759da4a8a3afb1..c5661b1fb5dc57 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2421,6 +2421,8 @@ void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) spin_unlock(&fc->bg_lock); spin_unlock(&fc->lock); + fuse_uring_flush_bg(fc); + /* * Wait 30s for all the events to complete or abort. Touch the * watchdog once per second so that we don't trip the hangcheck timer diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index dbc1a883b68b6f..f7f54fce1fa947 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -141,6 +141,9 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) struct fuse_ring_queue *queue; struct fuse_ring *ring = fc->ring; + if (!ring) + return; + for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index ea86d4084e7676..c089a7943d2598 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -210,6 +210,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +static inline void fuse_uring_flush_bg(struct fuse_conn *fc) +{ +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ From d03365028d1573781ef7b576f2a0e750a495a1f0 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 15:54:09 +0200 Subject: 
[PATCH 16/47] fuse: fix unnecessary connection abort in dlm lock acquiring When calling the fuse server with a dlm request and the fuse server responds with some other error than ENOSYS most likely the lock size will be set to zero. In that case the kernel will abort the fuse connection. This is completely unnecessary. Signed-off-by: Horst Birthelmer (imported from commit 0bc2f9c39c52ad11a1753e5be376c424b06f43db) --- fs/fuse/fuse_dlm_cache.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index ea947f34a9f70a..a9cad2c1bd2174 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -533,19 +533,19 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, return; } - if (outarg.locksize < end - offset + 1) { - /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); - fuse_abort_conn(fc); - return; - } - if (err) return; else - /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } else { + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, offset + outarg.locksize - 1, FUSE_PAGE_LOCK_WRITE); + } } From f2c85b3cc52e75001d1d58edff7a58e8addb5f1b Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 18:15:55 +0200 Subject: [PATCH 17/47] fuse: fix connection abort on mmap when fuse server returns ENOSYS Check whether dlm is still enabled when interpreting the returned error from fuse server. 
Signed-off-by: Horst Birthelmer (imported from commit f6fbf7c7bfb976ae2a30b4d699770a13e699ff04) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index eda3b6866c3078..395ea400e41473 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2328,7 +2328,7 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && outarg.locksize < length) { + if (!err && fc->dlm && outarg.locksize < length) { /* fuse server is seriously broken */ pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", length, outarg.locksize); From 751660f8719598bbcde07abe443a83881967b999 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 5 Jul 2024 18:04:49 +0800 Subject: [PATCH 18/47] fuse: make foffset alignment opt-in for optimum backend performance Sometimes the file offset alignment needs to be opt-in to achieve the optimum performance at the backend store. For example when ErasureCode [1] is used at the backend store, the optimum write performance is achieved when the WRITE request is aligned with the stripe size of ErasureCode. Otherwise a non-aligned WRITE request needs to be split at the stripe size boundary. It is quite costly to handle these split partial requests, as firstly the whole stripe to which the split partial request belongs needs to be read out, then overwrite the read stripe buffer with the request, and finally write the whole stripe back to the persistent storage. Thus the backend store can suffer severe performance degradation when WRITE requests can not fit into one stripe exactly. The write performance can be 10x slower when the request is 256KB in size given 4MB stripe size. Also there can be 50% performance degradation in theory if the request is not stripe boundary aligned. 
Besides, the conveyed test indicates that, the non-alignment issue becomes more severe when decreasing fuse's max_ratio, maybe partly because the background writeback now is more likely to run parallelly with the dirtier. fuse's max_ratio ratio of aligned WRITE requests ---------------- ------------------------------- 70 99.9% 40 74% 20 45% 10 20% With the patched version, which makes the alignment constraint opt-in when constructing WRITE requests, the ratio of aligned WRITE requests increases to 98% (previously 20%) when fuse's max_ratio is 10. fuse: fix alignment to work with redfs ubuntu - small fix to make the fuse alignment patch work with redfs ubuntu 6.8.x - add writeback_control to fuse_writepage_need_send() to make more accurate decisions about when to skip sending data - fix shift number for FUSE_ALIGN_PG_ORDER - remove test code [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4 Signed-off-by: Jingbo Xu Signed-off-by: Bernd Schubert Signed-off-by: Horst Birthelmer (imported from commit 5e590a657460229e5cc8b05c5477a47955c96885) --- fs/fuse/file.c | 28 ++++++++++++++++++++++++---- fs/fuse/fuse_i.h | 4 ++++ fs/fuse/inode.c | 8 ++++++++ include/uapi/linux/fuse.h | 12 +++++++++++- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 395ea400e41473..51f71acbeabe53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2126,9 +2126,12 @@ static void fuse_writepages_send(struct inode *inode, spin_unlock(&fi->lock); } -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, - unsigned len, struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) + +static bool fuse_writepage_need_send(struct fuse_conn *fc, + loff_t pos, unsigned len, + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data, + struct writeback_control *wbc) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; @@ -2157,6 +2160,23 @@ 
static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, !fuse_pages_realloc(data, fc->max_pages)) return true; + /* Reached alignment */ + if (fc->alignment_pages) { + unsigned int total_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t page_index = pos >> PAGE_SHIFT; + + if (!(page_index % fc->alignment_pages)) { + pgoff_t end_page_index = (wbc->range_end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* we are at a point where we would write aligned + * check if we potentially could reach the next alignment */ + if (page_index + fc->alignment_pages > end_page_index) + return true; + + if (total_pages + fc->alignment_pages > fc->max_pages) + return true; + } + } return false; } @@ -2180,7 +2200,7 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, return -EIO; } - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data, wpc->wbc)) { fuse_writepages_send(inode, data); data->wpa = NULL; data->nr_bytes = 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6d0197bd1a6f5f..63731c853c7e50 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1007,6 +1007,10 @@ struct fuse_conn { * inode->i_blkbits. 
*/ u8 blkbits; + + /* The foffset alignment in PAGE */ + unsigned int alignment_pages; + }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9f52fa6bf5b239..d74e3e48056f87 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1483,6 +1483,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; } + + if (flags & FUSE_ALIGN_PG_ORDER) { + if (arg->align_page_order > 0) { + fc->alignment_pages = + (1UL << arg->align_page_order) + >> PAGE_SHIFT; + } + } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; if (flags & FUSE_ALLOW_IDMAP) { diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index b8da5cc6e159da..15e18790d1ca8a 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -445,6 +445,8 @@ struct fuse_file_lock { * init_out.request_timeout contains the timeout (in secs) * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for + * optimal io-size alignment */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -492,6 +494,9 @@ struct fuse_file_lock { #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) + +#define FUSE_ALIGN_PG_ORDER (1ULL << 50) + #define FUSE_INVAL_INODE_ENTRY (1ULL << 60) #define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) @@ -912,6 +917,9 @@ struct fuse_init_in { #define FUSE_COMPAT_INIT_OUT_SIZE 8 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24 +/* + * align_page_order: Number of pages for optimal IO, or a multiple of that + */ struct fuse_init_out { uint32_t major; uint32_t minor; @@ -926,7 +934,9 @@ struct fuse_init_out { uint32_t flags2; uint32_t max_stack_depth; uint16_t request_timeout; - uint16_t 
unused[11]; + uint8_t align_page_order; + uint8_t padding; + uint16_t unused[10]; }; #define CUSE_INIT_INFO_MAX 4096 From e41c4d9d8c0fe5ff02e472a5d2a78d5e7fe04c08 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Wed, 20 Aug 2025 16:56:43 +0200 Subject: [PATCH 19/47] fuse: change FUSE DLM_LOCK to request start and end of area - Increase the possible lock size to 64 bit. - change semantics of DLM locks to request start and end - change semantics of DLM request return to mark start and end of the locked area - better prepare dlm lock range cache rb-tree for unaligned byte range locks which could return any value as long as it is larger than the range requested - add the case where start and end are zero to destroy the cache Signed-off-by: Horst Birthelmer (imported from commit 87968c738b67b07084b19b5e727074c0604d7ba6) --- fs/fuse/file.c | 13 +++++--- fs/fuse/fuse_dlm_cache.c | 67 +++++++++++++++++++++------------------ fs/fuse/fuse_dlm_cache.h | 12 +++---- fs/fuse/inode.c | 11 ++++--- include/uapi/linux/fuse.h | 11 ++++--- 5 files changed, 64 insertions(+), 50 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 51f71acbeabe53..bbbc95629fdf4b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2330,8 +2330,8 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = length; + inarg.start = offset; + inarg.end = offset + length - 1; inarg.type = FUSE_DLM_PAGE_MKWRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -2348,10 +2348,13 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && fc->dlm && outarg.locksize < length) { + if (!err && + fc->dlm && + (outarg.start > inarg.start || + outarg.end < inarg.end)) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", - length, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu 
bytes returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); err = -EINVAL; } diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index a9cad2c1bd2174..d765dd8018cc6a 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -16,11 +16,11 @@ struct fuse_dlm_range { /* Interval tree node */ struct rb_node rb; /* Start page offset (inclusive) */ - pgoff_t start; + uint64_t start; /* End page offset (inclusive) */ - pgoff_t end; + uint64_t end; /* Subtree end value for interval tree */ - pgoff_t __subtree_end; + uint64_t __subtree_end; /* Lock mode */ enum fuse_page_lock_mode mode; /* Temporary list entry for operations */ @@ -32,19 +32,19 @@ struct fuse_dlm_range { #define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ /* Interval tree definitions for page ranges */ -static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_start(struct fuse_dlm_range *range) { return range->start; } -static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_last(struct fuse_dlm_range *range) { return range->end; } -INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, - fuse_dlm_range_start, fuse_dlm_range_last, static, - fuse_page_it); +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, uint64_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); /** * fuse_page_cache_init - Initialize a page cache lock manager @@ -101,8 +101,8 @@ void fuse_dlm_cache_release_locks(struct fuse_inode *inode) * Return: Pointer to the first overlapping range, or NULL if none found */ static struct fuse_dlm_range * -fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { return fuse_page_it_iter_first(&cache->ranges, start, end); } @@ -116,8 +116,8 @@ 
fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, * Attempt to merge ranges within and adjacent to the specified region * that have the same lock mode. */ -static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { struct fuse_dlm_range *range, *next; struct rb_node *node; @@ -182,8 +182,8 @@ static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *new_range, *next; @@ -191,7 +191,7 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, int ret = 0; LIST_HEAD(to_lock); LIST_HEAD(to_upgrade); - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return -EINVAL; @@ -304,8 +304,8 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { struct fuse_dlm_range *range, *new_range; int ret = 0; @@ -363,11 +363,12 @@ static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, * @end: End page offset * * Release locks on the specified range of pages. + * Note that if start and end are set to zero the cache is destroyed. 
* * Return: 0 on success, negative error code on failure */ int fuse_dlm_unlock_range(struct fuse_inode *inode, - pgoff_t start, pgoff_t end) + uint64_t start, uint64_t end) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *next; @@ -376,6 +377,11 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, if (!cache) return -EINVAL; + if (start == 0 && end == 0) { + fuse_dlm_cache_release_locks(inode); + return 0; + } + down_write(&cache->lock); /* Find all ranges that overlap with [start, end] */ @@ -424,13 +430,13 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, * * Return: true if the entire range is locked, false otherwise */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range; int lock_mode = 0; - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return false; @@ -491,7 +497,7 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_mount *fm = ff->fm; - loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + uint64_t end = (offset + length - 1) | (PAGE_SIZE - 1); /* note that the offset and length don't have to be page aligned here * but since we only get here on writeback caching we will send out @@ -514,8 +520,8 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = end - offset + 1; + inarg.start = offset; + inarg.end = end; inarg.type = FUSE_DLM_LOCK_WRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -536,16 +542,17 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, if (err) return; 
else - if (outarg.locksize < end - offset + 1) { + if (inarg.start < outarg.start || + inarg.end > outarg.end) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); return; } else { /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, - offset + outarg.locksize - 1, - FUSE_PAGE_LOCK_WRITE); + fuse_dlm_lock_range(fi, outarg.start, + outarg.end, + FUSE_PAGE_LOCK_WRITE); } } diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h index 98b27a2c15d8ba..438d31d28b666e 100644 --- a/fs/fuse/fuse_dlm_cache.h +++ b/fs/fuse/fuse_dlm_cache.h @@ -32,16 +32,16 @@ int fuse_dlm_cache_init(struct fuse_inode *inode); void fuse_dlm_cache_release_locks(struct fuse_inode *inode); /* Lock a range of pages */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); /* Unlock a range of pages */ -int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end); +int fuse_dlm_unlock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end); /* Check if a page range is already locked */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); /* this is the interface to the filesystem */ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d74e3e48056f87..555aace561664c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -625,11 +625,14 @@ int 
fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = (offset + len - 1) >> PAGE_SHIFT; if (fc->dlm && fc->writeback_cache) - /* invalidate the range from the beginning of the first page - * in the given range to the last byte of the last page */ + /* Invalidate the range exactly as the fuse server requested + * except for the case where it sends -1. + * Note that this can lead to some inconsistencies if + * the fuse server sends unaligned data */ fuse_dlm_unlock_range(fi, - pg_start << PAGE_SHIFT, - (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + offset, + pg_end == -1 ? 0 : + (offset + len - 1)); invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 15e18790d1ca8a..cdbc01b7a21d16 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1263,10 +1263,10 @@ enum fuse_dlm_lock_type { */ struct fuse_dlm_lock_in { uint64_t fh; - uint64_t offset; - uint32_t size; + uint64_t start; + uint64_t end; uint32_t type; - uint64_t reserved; + uint32_t reserved; }; /** @@ -1276,8 +1276,9 @@ struct fuse_dlm_lock_in { * to reduce number of calls) */ struct fuse_dlm_lock_out { - uint32_t locksize; - uint32_t padding; + uint64_t start; + uint64_t end; + uint64_t reserved; }; /** From cf94d3c8a8cdefe3794a5cac830e8eda0299322c Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 24 Sep 2025 08:12:17 +0000 Subject: [PATCH 20/47] fuse: fix memory leak in fuse-over-io-uring argument copies Fix reference count leak of payload pages during fuse argument copies. 
Signed-off-by: Cheng Ding (imported from commit 8b75cf05a2efc20e8f46ba9e10664c502249ee21) --- fs/fuse/dev.c | 2 +- fs/fuse/dev_uring.c | 13 ++++++++++--- fs/fuse/fuse_dev_i.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c5661b1fb5dc57..eb8d8551193dd5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -830,7 +830,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write, } /* Unmap and put previous page of userspace buffer */ -static void fuse_copy_finish(struct fuse_copy_state *cs) +void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f7f54fce1fa947..3738f43a4ee4db 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -638,7 +638,9 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (ent->payload_pages) cs.ring.pages = ent->payload_pages; - return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + fuse_copy_finish(&cs); + return err; } /* @@ -685,11 +687,14 @@ static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + +copy_finish: + fuse_copy_finish(&cs); return err; } @@ -745,12 +750,14 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); +copy_finish: + fuse_copy_finish(&cs); return err ? 
-EFAULT : 0; } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 1e91079947f82b..fab53a1cc37ed5 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -55,6 +55,7 @@ void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); +void fuse_copy_finish(struct fuse_copy_state *cs); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, int zeroing); From 005f3ddfe1c48864fb30363ab3481600c8173870 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 2 Jun 2025 23:23:43 +0200 Subject: [PATCH 21/47] fuse: {io-uring} Add queue length counters This is another preparation and will be used for decision which queue to add a request to. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong (imported from commit e4698faf912435f7f3f28c169f7bb8342d7b1edf) --- fs/fuse/dev_uring.c | 17 +++++++++++++++-- fs/fuse/dev_uring_i.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3738f43a4ee4db..1f7f4265a76210 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -103,13 +103,13 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + queue->nr_reqs--; if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } - spin_unlock(&queue->lock); if (error) @@ -129,6 +129,7 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); + queue->nr_reqs = 0; spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ @@ -1496,10 +1497,13 @@ void 
fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); + queue->nr_reqs++; + if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); if (ent) @@ -1535,6 +1539,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) set_bit(FR_URING, &req->flags); req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); + queue->nr_reqs++; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); @@ -1567,8 +1572,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) bool fuse_uring_remove_pending_req(struct fuse_req *req) { struct fuse_ring_queue *queue = req->ring_queue; + bool removed = fuse_remove_pending_req(req, &queue->lock); + + if (removed) { + /* Update counters after successful removal */ + spin_lock(&queue->lock); + queue->nr_reqs--; + spin_unlock(&queue->lock); + } - return fuse_remove_pending_req(req, &queue->lock); + return removed; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index c089a7943d2598..e1a3ab8e08f734 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -98,6 +98,9 @@ struct fuse_ring_queue { /* background fuse requests */ struct list_head fuse_req_bg_queue; + /* number of requests queued or in userspace */ + unsigned int nr_reqs; + struct fuse_pqueue fpq; unsigned int active_background; From 0a2127d94c5742f87166b70657f0cb811cc30486 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 13 Jun 2025 15:12:47 +0200 Subject: [PATCH 22/47] fuse: {io-uring} Rename ring->nr_queues to max_nr_queues This is preparation for follow up commits that allow to run with a reduced number of queues. 
Signed-off-by: Bernd Schubert (imported from commit 2e27c33ffcf65b434ada1364a4d2ea92b094f0c3) --- fs/fuse/dev_uring.c | 24 ++++++++++++------------ fs/fuse/dev_uring_i.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1f7f4265a76210..a7eb333c24bfde 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -145,7 +145,7 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -185,7 +185,7 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) if (!ring) return false; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -227,7 +227,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -290,7 +290,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - ring->nr_queues = nr_queues; + ring->max_nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; smp_store_release(&fc->ring, ring); @@ -442,7 +442,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) int qid; struct fuse_ring_ent *ent; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) @@ -473,7 +473,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct 
fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -508,7 +508,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -1005,7 +1005,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, if (!ring) return err; - if (qid >= ring->nr_queues) + if (qid >= ring->max_nr_queues) return -EINVAL; queue = ring->queues[qid]; @@ -1068,7 +1068,7 @@ static bool is_ring_ready(struct fuse_ring *ring, int current_qid) struct fuse_ring_queue *queue; bool ready = true; - for (qid = 0; qid < ring->nr_queues && ready; qid++) { + for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { if (current_qid == qid) continue; @@ -1307,7 +1307,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, return err; } - if (qid >= ring->nr_queues) { + if (qid >= ring->max_nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } @@ -1436,9 +1436,9 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) qid = task_cpu(current); - if (WARN_ONCE(qid >= ring->nr_queues, + if (WARN_ONCE(qid >= ring->max_nr_queues, "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->nr_queues)) + ring->max_nr_queues)) qid = 0; queue = ring->queues[qid]; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index e1a3ab8e08f734..1a0ece0f92d08a 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,7 +117,7 @@ struct fuse_ring { struct fuse_conn *fc; /* number of ring queues */ - size_t nr_queues; + size_t max_nr_queues; /* maximum payload/arg size */ size_t max_payload_sz; From 3766d586fe5b49178cc210ca2c06e61f1d39b4b1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 10 Jun 2025 16:23:28 +0200 Subject: [PATCH 23/47] fuse: {io-uring} Use bitmaps to track registered queues Add per-CPU and 
per-NUMA node bitmasks to track which io-uring queues are registered. Signed-off-by: Bernd Schubert (imported from commit be6edce441ecc37ee34a8937f07c01ab99bfb7f7) --- fs/fuse/dev_uring.c | 72 +++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 20 ++++++++++++ 2 files changed, 92 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index a7eb333c24bfde..30f848505c439d 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -219,6 +219,23 @@ static void io_pages_free(struct page ***pages, int npages) *pages = NULL; } +static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map) +{ + free_cpumask_var(q_map->registered_q_mask); + kfree(q_map->cpu_to_qid); +} + +static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) +{ + int node; + + fuse_ring_destruct_q_map(&ring->q_map); + + if (ring->numa_q_map) + for (node = 0; node < ring->nr_numa_nodes; node++) + fuse_ring_destruct_q_map(&ring->numa_q_map[node]); +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -253,11 +270,45 @@ void fuse_uring_destruct(struct fuse_conn *fc) ring->queues[qid] = NULL; } + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); fc->ring = NULL; } +static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) +{ + if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; + + q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), + GFP_KERNEL_ACCOUNT); + + return 0; +} + +static int fuse_uring_create_q_masks(struct fuse_ring *ring) +{ + int err, node; + + err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + if (err) + return err; + + ring->numa_q_map = kcalloc(ring->nr_numa_nodes, + sizeof(*ring->numa_q_map), + GFP_KERNEL_ACCOUNT); + if (!ring->numa_q_map) + return -ENOMEM; + for (node = 0; node < ring->nr_numa_nodes; node++) { + err = fuse_uring_init_q_map(&ring->numa_q_map[node], + ring->max_nr_queues); + if (err) + 
return err; + } + return 0; +} + /* * Basic ring setup for this connection based on the provided configuration */ @@ -267,11 +318,14 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; + int err; ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); if (!ring) return NULL; + ring->nr_numa_nodes = num_online_nodes(); + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), GFP_KERNEL_ACCOUNT); if (!ring->queues) @@ -280,6 +334,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + err = fuse_uring_create_q_masks(ring); + if (err) + goto out_err; + spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ @@ -299,6 +357,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return ring; out_err: + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); return res; @@ -461,6 +520,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } + spin_unlock(&queue->lock); } ring->stop_debug_log = 1; @@ -507,6 +567,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; + int node; for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); @@ -518,6 +579,13 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) fuse_uring_teardown_entries(queue); } + /* Reset all queue masks, we won't process any more IO */ + cpumask_clear(ring->q_map.registered_q_mask); + for (node = 0; node < ring->nr_numa_nodes; node++) { + if (ring->numa_q_map) + cpumask_clear(ring->numa_q_map[node].registered_q_mask); + } + if 
(atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, @@ -1098,6 +1166,10 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; + int node = cpu_to_node(queue->qid); + + if (WARN_ON_ONCE(node >= ring->nr_numa_nodes)) + node = 0; fuse_uring_prepare_cancel(cmd, issue_flags, ent); diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 1a0ece0f92d08a..4caf7626604c8e 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -108,6 +108,17 @@ struct fuse_ring_queue { bool stopped; }; +struct fuse_queue_map { + /* Tracks which queues are registered */ + cpumask_var_t registered_q_mask; + + /* number of registered queues */ + size_t nr_queues; + + /* cpu to qid mapping */ + int *cpu_to_qid; +}; + /** * Describes if uring is for communication and holds alls the data needed * for uring communication */ @@ -119,6 +130,9 @@ struct fuse_ring { /* number of ring queues */ size_t max_nr_queues; + /* number of numa nodes */ + int nr_numa_nodes; + /* maximum payload/arg size */ size_t max_payload_sz; @@ -129,6 +143,12 @@ struct fuse_ring { */ unsigned int stop_debug_log : 1; + /* per numa node queue tracking */ + struct fuse_queue_map *numa_q_map; + + /* all queue tracking */ + struct fuse_queue_map q_map; + wait_queue_head_t stop_waitq; /* async tear down */ From 2b1fec0bb6b30efe8019e14e5339f4365a82d54a Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 4 Jun 2025 19:32:39 +0200 Subject: [PATCH 24/47] fuse: {io-uring} Allow reduced number of ring queues Queue selection (fuse_uring_get_queue) can handle a reduced number of queues - using io-uring is possible now even with a single queue and entry. The FUSE_URING_REDUCED_Q flag is being introduced to tell the fuse server that reduced queues are possible, i.e. if the flag is set, the fuse server is free to reduce the number of queues.
Signed-off-by: Bernd Schubert (imported from commit f620f3d35969bd9a04304b757a18a11a0787dedc) --- fs/fuse/dev_uring.c | 124 +++++++++++++++++++++++--------------- fs/fuse/inode.c | 6 +- include/uapi/linux/fuse.h | 4 +- 3 files changed, 82 insertions(+), 52 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 30f848505c439d..66b6f882a0d69d 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -283,15 +283,17 @@ static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), GFP_KERNEL_ACCOUNT); + if (!q_map->cpu_to_qid) + return -ENOMEM; return 0; } -static int fuse_uring_create_q_masks(struct fuse_ring *ring) +static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues) { int err, node; - err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + err = fuse_uring_init_q_map(&ring->q_map, nr_queues); if (err) return err; @@ -302,7 +304,7 @@ static int fuse_uring_create_q_masks(struct fuse_ring *ring) return -ENOMEM; for (node = 0; node < ring->nr_numa_nodes; node++) { err = fuse_uring_init_q_map(&ring->numa_q_map[node], - ring->max_nr_queues); + nr_queues); if (err) return err; } @@ -334,7 +336,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); - err = fuse_uring_create_q_masks(ring); + err = fuse_uring_create_q_masks(ring, nr_queues); if (err) goto out_err; @@ -363,12 +365,37 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return res; } +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, + struct fuse_queue_map *q_map) +{ + int cpu, qid_idx; + size_t nr_queues; + + cpumask_set_cpu(qid, q_map->registered_q_mask); + nr_queues = cpumask_weight(q_map->registered_q_mask); + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { + if (!q_map->cpu_to_qid) + 
return; + + /* + * Position of this CPU within the registered queue mask, + * handles non-contiguous CPU distributions across NUMA nodes. + */ + qid_idx = bitmap_weight( + cpumask_bits(q_map->registered_q_mask), cpu); + + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx % nr_queues, + q_map->registered_q_mask); + } +} + static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; + int node; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) @@ -406,6 +433,22 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); + + /* Static mapping from cpu to per numa queues */ + node = cpu_to_node(qid); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node]); + + /* + * smp_store_release, as the variable is read without fc->lock and + * we need to avoid compiler re-ordering of updating the nr_queues + * and setting ring->numa_queues[node].cpu_to_qid above + */ + smp_store_release (&ring->numa_q_map[node].nr_queues, + ring->numa_q_map[node].nr_queues + 1); + + /* global mapping */ + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map); + spin_unlock(&fc->lock); return queue; @@ -1130,31 +1173,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -static bool is_ring_ready(struct fuse_ring *ring, int current_qid) -{ - int qid; - struct fuse_ring_queue *queue; - bool ready = true; - - for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { - if (current_qid == qid) - continue; - - queue = ring->queues[qid]; - if (!queue) { - ready = false; - break; - } - - spin_lock(&queue->lock); - if (list_empty(&queue->ent_avail_queue)) - ready = false; - spin_unlock(&queue->lock); - } - - return ready; -} - /* * fuse_uring_req_fetch command handling */ @@ -1179,13 +1197,9 @@ static 
void fuse_uring_do_register(struct fuse_ring_ent *ent, spin_unlock(&queue->lock); if (!ring->ready) { - bool ready = is_ring_ready(ring, queue->qid); - - if (ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } @@ -1501,22 +1515,36 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) { unsigned int qid; - struct fuse_ring_queue *queue; + int node; + unsigned int nr_queues; + unsigned int cpu = task_cpu(current); - qid = task_cpu(current); + cpu = cpu % ring->max_nr_queues; - if (WARN_ONCE(qid >= ring->max_nr_queues, - "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->max_nr_queues)) - qid = 0; + /* numa local registered queue bitmap */ + node = cpu_to_node(cpu); + if (WARN_ONCE(node >= ring->nr_numa_nodes, + "Node number (%d) exceeds nr nodes (%d)\n", + node, ring->nr_numa_nodes)) { + node = 0; + } - queue = ring->queues[qid]; - WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); + if (nr_queues) { + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); + } - return queue; + /* global registered queue bitmap */ + qid = ring->q_map.cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); } static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) @@ -1556,7 +1584,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_task_to_queue(ring); + queue = 
fuse_uring_select_queue(ring); if (!queue) goto err; @@ -1598,7 +1626,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring); if (!queue) return false; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 555aace561664c..c23eb6531bdcb4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1550,8 +1550,7 @@ void fuse_send_init(struct fuse_mount *fm) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - flags = - FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -1564,7 +1563,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | - FUSE_REQUEST_TIMEOUT | FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY; + FUSE_REQUEST_TIMEOUT | FUSE_INVAL_INODE_ENTRY | + FUSE_EXPIRE_INODE_ENTRY | FUSE_URING_REDUCED_Q; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index cdbc01b7a21d16..40e26460d69856 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -447,6 +447,8 @@ struct fuse_file_lock { * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for * optimal io-size alignment + * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free + * to register between 1 and nr-core io-uring queues */ #define FUSE_ASYNC_READ (1 << 0) 
#define FUSE_POSIX_LOCKS (1 << 1) @@ -496,7 +498,7 @@ struct fuse_file_lock { #define FUSE_REQUEST_TIMEOUT (1ULL << 42) #define FUSE_ALIGN_PG_ORDER (1ULL << 50) - +#define FUSE_URING_REDUCED_Q (1ULL << 59) #define FUSE_INVAL_INODE_ENTRY (1ULL << 60) #define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) From da27244741ccdb4c8b19177b6ee0532c08e01f9b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 24 Sep 2025 19:14:19 +0200 Subject: [PATCH 25/47] fuse: {io-uring} Queue background requests on a different core Running background IO on a different core makes quite a difference. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=272MiB/s (285MB/s) ... patched READ: bw=650MiB/s (682MB/s) Reason is easily visible, the fio process is migrating between CPUs when requests are submitted on the queue for the same core. With --iodepth=8 unpatched READ: bw=466MiB/s (489MB/s) patched READ: bw=641MiB/s (672MB/s) Without io-uring (--iodepth=8) READ: bw=729MiB/s (764MB/s) Without fuse (--iodepth=8) READ: bw=2199MiB/s (2306MB/s) (Test were done with /example/passthrough_hp -o allow_other --nopassthrough \ [-o io_uring] /tmp/source /tmp/dest ) Additional notes: With FURING_NEXT_QUEUE_RETRIES=0 (--iodepth=8) READ: bw=903MiB/s (946MB/s) With just a random qid (--iodepth=8) READ: bw=429MiB/s (450MB/s) With --iodepth=1 unpatched READ: bw=195MiB/s (204MB/s) patched READ: bw=232MiB/s (243MB/s) With --iodepth=1 --numjobs=2 unpatched READ: bw=366MiB/s (384MB/s) patched READ: bw=472MiB/s (495MB/s) With --iodepth=1 --numjobs=8 unpatched READ: bw=1437MiB/s (1507MB/s) patched READ: bw=1529MiB/s (1603MB/s) fuse without io-uring READ: bw=1314MiB/s (1378MB/s), 1314MiB/s-1314MiB/s ... no-fuse READ: bw=2566MiB/s (2690MB/s), 2566MiB/s-2566MiB/s ... 
In summary, for async requests the core doing application IO is busy sending requests and processing IOs should be done on a different core. Spreading the load on random cores is also not desirable, as the core might be frequency scaled down and/or in C1 sleep states. Not shown here, but differences are much smaller when the system uses the performance governor instead of schedutil (ubuntu default). Obviously at the cost of higher system power consumption for the performance governor - not desirable either. Results without io-uring (which uses fixed libfuse threads per queue) heavily depend on the current number of active threads. Libfuse uses a default of max 10 threads, but the actual nr of max threads is a parameter. Also, no-fuse-io-uring results heavily depend on whether another workload was already running before, as libfuse starts these threads dynamically - i.e. the more threads are active, the worse the performance. Signed-off-by: Bernd Schubert (imported from commit c6399ea79b104ac79758f2c36f1977b80a02358d) --- fs/fuse/dev_uring.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 66b6f882a0d69d..d16ccf230bee6f 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1515,13 +1515,21 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; int node; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy.
+ */ + if (background) + cpu++; + cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1584,7 +1592,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; @@ -1626,7 +1634,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; From e56d7989f2a6fa1d1e592a9ec91e057fa26888a8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 24 Oct 2025 19:05:07 +0200 Subject: [PATCH 26/47] fuse: Add retry attempts for numa local queues for load distribution This is to further improve performance. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=650MiB/s (682MB/s) patched: READ: bw=995MiB/s (1043MB/s) with --iodepth=8 unpatched READ: bw=641MiB/s (672MB/s) patched READ: bw=966MiB/s (1012MB/s) Reason is that with --iodepth=x (x > 1) fio submits multiple async requests and a single queue might become CPU limited. I.e. spreading the load helps. 
(imported from commit 2e73b0be1f55d61c2d861a12bf6bb9963b9b877a) --- fs/fuse/dev_uring.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index d16ccf230bee6f..6fe552e7bf9558 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,6 +22,8 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +#define FUSE_URING_Q_THRESHOLD 2 + /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -1519,9 +1521,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node; + int node, retries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; /* * Background requests result in better performance on a different @@ -1530,6 +1533,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, if (background) cpu++; +retry: cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1545,12 +1549,35 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - return READ_ONCE(ring->queues[qid]); + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) + return queue; + + /* Retries help for load balancing */ + if (retries < FUSE_URING_Q_THRESHOLD) { + if (!retries) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a differet qid*/ + cpu++; + retries++; + goto retry; + } } + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + 
/* Might happen on teardown */ return NULL; return READ_ONCE(ring->queues[qid]); } From 592d08305c9664405d70643d10cc024368adee95 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 22 Oct 2025 22:58:13 +0200 Subject: [PATCH 27/47] fuse: Invalidate the page cache after FOPEN_DIRECT_IO write generic_file_direct_write() also does this and has a large comment about it. Reproducer here is xfstest's generic/209, which is exactly to have competing DIO write and cached IO read. Signed-off-by: Bernd Schubert (imported from commit 9e04c8aa32d008c44aab5348e4e039ed8e78ca47) --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bbbc95629fdf4b..3c22e9ca507d7a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1708,6 +1708,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); From 0dc14f016c2ab84472c8bde11a0e98b0aa8d0768 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 22 Oct 2025 23:01:27 +0200 Subject: [PATCH 28/47] fuse: Always flush the page cache before FOPEN_DIRECT_IO write This was done as a condition on direct_io_allow_mmap, but I believe this is not right, as a file might be open two times - once with write-back enabled, another time with FOPEN_DIRECT_IO.
Signed-off-by: Bernd Schubert (imported from commit dfca33861c21aa0bdab22366154d8488fa82d96f) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3c22e9ca507d7a..bd729e818e55c1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1634,7 +1634,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); From 7e7ef5149603e4ccbe50d0ff4e5ddd018732efdd Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 10 Nov 2025 13:17:38 +0100 Subject: [PATCH 29/47] fuse: Fetch a queued fuse request on command registration With the reduced queue feature io-uring is marked as ready after receiving the 1st ring entry. At this time other queues just might be in the process of registration and then a race happens fuse_uring_queue_fuse_req -> no queue entry registered yet list_add_tail -> fuse request gets queued So far fetching requests from the list only happened from FUSE_IO_URING_CMD_COMMIT_AND_FETCH, but without new requests on the same queue, it would actually never send requests from that queue - the request was stuck. 
(imported from commit 3bfb6cdc9b978a13eab59ebae592ddfa225c4c4a) --- fs/fuse/dev_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 6fe552e7bf9558..ab5373548acc26 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1418,6 +1418,8 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; } From c36239bcee1830ce6b06331451e0b8f632828ead Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Tue, 16 Sep 2025 13:31:45 +0200 Subject: [PATCH 30/47] fuse: add compound command to combine multiple requests fuse.h: add new opcode FUSE_COMPOUND fuse_compound.c: add new functionality to pack multiple fuse operations into one compound command file.c: add an implementation of open+getattr Signed-off-by: Horst Birthelmer (imported from commit d9e735140a3faccbe5786a7e75a4ad9a6a9aa2e0) --- fs/fuse/Makefile | 2 +- fs/fuse/compound.c | 432 ++++++++++++++++++++++++++++++++++++++ fs/fuse/file.c | 127 +++++++++-- fs/fuse/fuse_i.h | 19 +- fs/fuse/inode.c | 6 + fs/fuse/ioctl.c | 2 +- include/uapi/linux/fuse.h | 38 ++++ 7 files changed, 602 insertions(+), 24 deletions(-) create mode 100644 fs/fuse/compound.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index d55e0e622e123b..f54c504ca6637c 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c new file mode 100644 index 00000000000000..3758b4666366ea --- /dev/null +++ b/fs/fuse/compound.c @@ -0,0 
+1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2025 + * + * This file implements compound operations for FUSE, allowing multiple + * operations to be batched into a single request to reduce round trips + * between kernel and userspace. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Compound request + */ +struct fuse_compound_req +{ + struct fuse_mount *fm; + struct fuse_compound_in compound_header; + struct fuse_compound_out result_header; + + size_t total_size; + char *buffer; + size_t buffer_pos; + size_t buffer_size; + + size_t total_expected_out_size; + + /* Operation results for error tracking */ + int op_errors[FUSE_MAX_COMPOUND_OPS]; + struct fuse_args *op_args[FUSE_MAX_COMPOUND_OPS]; + + /* Parsing state to avoid double processing */ + bool parsed; +}; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, + uint32_t flags) +{ + struct fuse_compound_req *compound; + + compound = kzalloc(sizeof(*compound), GFP_KERNEL); + if (!compound) + return ERR_PTR(-ENOMEM); + + compound->fm = fm; + compound->compound_header.flags = flags; + compound->buffer_size = PAGE_SIZE; + compound->buffer = kvmalloc(compound->buffer_size, GFP_KERNEL); + if (!compound->buffer) { + kfree(compound); + return ERR_PTR(-ENOMEM); + } + return compound; +} + +/* + * Free compound request resources + */ +void fuse_compound_free(struct fuse_compound_req *compound) +{ + if (compound) { + kvfree(compound->buffer); + kfree(compound); + } +} + +/* + * Validate compound request structure before sending it out. + * Returns 0 on success, negative error code on failure. 
+ */ +static int fuse_compound_validate_header(struct fuse_compound_req *compound) +{ + struct fuse_compound_in *in_header = &compound->compound_header; + size_t offset = 0; + int i; + + if (compound->buffer_pos > compound->buffer_size) + return -EINVAL; + + if (!compound || !compound->buffer) + return -EINVAL; + + if (compound->buffer_pos < sizeof(struct fuse_in_header)) + return -EINVAL; + + if (in_header->count == 0 || in_header->count > FUSE_MAX_COMPOUND_OPS) + return -EINVAL; + + for (i = 0; i < in_header->count; i++) { + const struct fuse_in_header *op_hdr; + + if (offset + sizeof(struct fuse_in_header) > compound->buffer_pos) { + pr_info_ratelimited("FUSE: compound operation %d header extends beyond buffer (offset %zu + header size %zu > buffer pos %zu)\n", + i, offset, sizeof(struct fuse_in_header), compound->buffer_pos); + return -EINVAL; + } + + op_hdr = (const struct fuse_in_header *)(compound->buffer + offset); + + if (op_hdr->len < sizeof(struct fuse_in_header)) { + pr_info_ratelimited("FUSE: compound operation %d has invalid length %u (minimum %zu bytes)\n", + i, op_hdr->len, sizeof(struct fuse_in_header)); + return -EINVAL; + } + + if (offset + op_hdr->len > compound->buffer_pos) { + pr_info_ratelimited("FUSE: compound operation %d extends beyond buffer (offset %zu + length %u > buffer pos %zu)\n", + i, offset, op_hdr->len, compound->buffer_pos); + return -EINVAL; + } + + if (op_hdr->opcode == 0 || op_hdr->opcode == FUSE_COMPOUND) { + pr_info_ratelimited("FUSE: compound operation %d has invalid opcode %u (cannot be 0 or FUSE_COMPOUND)\n", + i, op_hdr->opcode); + return -EINVAL; + } + + if (op_hdr->nodeid == 0) { + pr_info_ratelimited("FUSE: compound operation %d has invalid node ID 0\n", i); + return -EINVAL; + } + + offset += op_hdr->len; + } + + if (offset != compound->buffer_pos) { + pr_info_ratelimited("FUSE: compound buffer size mismatch (calculated %zu bytes, actual %zu bytes)\n", + offset, compound->buffer_pos); + return -EINVAL; + } + + 
return 0; +} + +/* + * Adds a single operation to the compound request. The operation is serialized + * into the request buffer with its own fuse_in_header. + * + * For operations with page-based payloads (in_pages=true), the page data is + * ignored at the moment. + * + * Returns 0 on success, negative error code on failure. + */ +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args) +{ + struct fuse_in_header *hdr; + size_t args_size = 0; + size_t needed_size; + size_t expected_out_size = 0; + size_t page_payload_size = 0; + int i; + + if (!compound || compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) + return -EINVAL; + + /* Calculate input size - handle page-based arguments separately */ + for (i = 0; i < args->in_numargs; i++) { + /* Last argument with in_pages flag gets data from pages */ + if (unlikely(i == args->in_numargs - 1 && args->in_pages)) { + /* the data handling is not supported at the moment */ + page_payload_size = args->in_args[i].size; + args_size += page_payload_size; + } else { + args_size += args->in_args[i].size; + } + } + + /* Calculate expected output size */ + for (i = 0; i < args->out_numargs; i++) + expected_out_size += args->out_args[i].size; + + needed_size = sizeof(struct fuse_in_header) + args_size; + + /* Expand buffer if needed */ + if (compound->buffer_pos + needed_size > compound->buffer_size) { + size_t new_size = max(compound->buffer_size * 2, + compound->buffer_pos + needed_size); + new_size = round_up(new_size, PAGE_SIZE); + char *new_buffer = kvrealloc(compound->buffer, + compound->buffer_size, + new_size, GFP_KERNEL); + if (!new_buffer) + return -ENOMEM; + compound->buffer = new_buffer; + compound->buffer_size = new_size; + } + + /* Build request header */ + hdr = (struct fuse_in_header *)(compound->buffer + compound->buffer_pos); + memset(hdr, 0, sizeof(*hdr)); + hdr->len = needed_size; + hdr->opcode = args->opcode; + hdr->nodeid = args->nodeid; + hdr->uid = 
from_kuid(compound->fm->fc->user_ns, current_fsuid()); + hdr->gid = from_kgid(compound->fm->fc->user_ns, current_fsgid()); + hdr->pid = pid_nr_ns(task_pid(current), compound->fm->fc->pid_ns); + hdr->unique = fuse_get_unique(&compound->fm->fc->iq); + compound->buffer_pos += sizeof(*hdr); + + if (args->in_pages) { + /* we have external payload, + * this is not supported at the moment */ + return -EINVAL; + } + + /* Copy operation arguments */ + for (i = 0; i < args->in_numargs; i++) { + memcpy(compound->buffer + compound->buffer_pos, + args->in_args[i].value, args->in_args[i].size); + compound->buffer_pos += args->in_args[i].size; + } + + compound->total_expected_out_size += expected_out_size; + + /* Store args for response parsing */ + compound->op_args[compound->compound_header.count] = args; + + compound->compound_header.count++; + compound->total_size += needed_size; + + return 0; +} + +/* + * Copy response data to fuse_args structure + * + * Returns 0 on success, negative error code on failure. 
+ */ +static void *fuse_copy_response_data(struct fuse_args *args, char *response_data) +{ + size_t copied = 0; + int arg_idx; + + for (arg_idx = 0; arg_idx < args->out_numargs; arg_idx++) { + struct fuse_arg current_arg = args->out_args[arg_idx]; + + /* Last argument with out_pages: copy to pages */ + if (arg_idx == args->out_numargs - 1 && args->out_pages) { + /* external payload (in the last out arg) + * is not supported at the moment + */ + return response_data; + } else { + size_t arg_size = current_arg.size; + if (current_arg.value && arg_size > 0) { + memcpy(current_arg.value, + (char *)response_data + copied, + arg_size); + copied += arg_size; + } + } + } + + return (char*)response_data + copied; +} + +int fuse_compound_get_error(struct fuse_compound_req * compound, + int op_idx) +{ + return compound->op_errors[op_idx]; +} + +/* + * Parse compound response + * + * Parses the compound response and populates the original + * fuse_args structures with the response data. This function is idempotent + * and can be called multiple times safely. + * + * For operations with page-based output (out_pages=true), the response data + * is ignored at the moment. + * + * Returns 0 on success, negative error code on failure. + */ +static int fuse_compound_parse_resp(struct fuse_compound_req *compound, + uint32_t count, void *response, size_t response_size) +{ + int i; + int res = 0; + + /* double parsing prevention will be important + * for large responses most likely out pages. 
+ */ + if (compound->parsed) { + return 0; + } + + void *op_out_data = response; + void *response_end = (char *)response + response_size; + + /* Basic validation */ + if (!response || response_size < sizeof(struct fuse_out_header)) { + return -EIO; + } + + /* Parse each operation response */ + for (i = 0; + i < count && i < compound->result_header.count; i++) { + struct fuse_out_header *op_hdr = op_out_data; + struct fuse_args *args = compound->op_args[i]; + + /* Validate header length */ + if (op_hdr->len < sizeof(struct fuse_out_header)) { + return -EIO; + } + + /* Check if the entire operation response fits in the buffer */ + if ((char *)op_out_data + op_hdr->len > (char *)response_end) { + return -EIO; + } + + if (op_hdr->error != 0) { + compound->op_errors[i] = op_hdr->error; + } + + /* Copy response data */ + if (args && op_hdr->len > sizeof(struct fuse_out_header)) { + op_out_data = fuse_copy_response_data(args, + op_out_data + sizeof(struct fuse_out_header)); + } else { + /* No response data, just advance past the header */ + op_out_data = (char *)op_out_data + op_hdr->len; + } + } + + compound->parsed = true; + return res; +} + +/* + * Send compound request to userspace + * + * Sends the compound request out and parses the response. + * + * -> in_arg[0] -> fuse_compound_in (containing mainly count and flags) + * -> in_arg[1] -> payload + * (containing the serialized requests created by fuse_compound_add) + * + * On success, the response data is copied to the original fuse_args + * structures for each operation. + * + * Returns 0 on success, or the first error code from any operation. + * Returns negative error code if the request itself fails. 
+ */ +ssize_t fuse_compound_send(struct fuse_compound_req *compound) +{ + size_t expected_response_size; + ssize_t ret; + struct fuse_args args = { + .opcode = FUSE_COMPOUND, + .nodeid = 0, + .in_numargs = 2, + .out_numargs = 2, + .out_argvar = true, + }; + + if (!compound) { + pr_info_ratelimited("FUSE: compound request is NULL in fuse_compound_send\n"); + return -EINVAL; + } + + if (compound->compound_header.count == 0) { + pr_info_ratelimited("FUSE: compound request contains no operations\n"); + return -EINVAL; + } + + /* Calculate response buffer size */ + expected_response_size = + compound->total_expected_out_size; + size_t total_buffer_size = expected_response_size + + (compound->compound_header.count * sizeof(struct fuse_out_header)); + + void *resp_payload = kvmalloc(total_buffer_size, GFP_KERNEL | __GFP_ZERO); + + if (!resp_payload) + return -ENOMEM; + /* tell the fuse server how much memory we have allocated */ + compound->compound_header.result_size = expected_response_size; + + args.in_args[0].size = sizeof(compound->compound_header); + args.in_args[0].value = &compound->compound_header; + args.in_args[1].size = compound->buffer_pos; + args.in_args[1].value = compound->buffer; + + args.out_args[0].size = sizeof(compound->result_header); + args.out_args[0].value = &compound->result_header; + args.out_args[1].size = total_buffer_size; + args.out_args[1].value = resp_payload; + + /* Validate request */ + ret = fuse_compound_validate_header(compound); + if (ret) + goto out; + + ret = fuse_compound_request(compound->fm, &args); + if (ret == -ENOSYS) { + goto out; + } + + size_t actual_response_size = args.out_args[1].size; + + /* Validate response size */ + if (actual_response_size < sizeof(struct fuse_compound_out)) { + pr_info_ratelimited("FUSE: compound response too small (%zu bytes, minimum %zu bytes)\n", + actual_response_size, sizeof(struct fuse_compound_out)); + ret = -EINVAL; + goto out; + } + + /* Parse response using actual size */ + ret = 
fuse_compound_parse_resp(compound, + compound->result_header.count, + ((char *)resp_payload), + actual_response_size); +out: + kvfree(resp_payload); + return ret; +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bd729e818e55c1..d50b4f12ee4297 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -125,8 +125,76 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) } } +static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, int flags, + int opcode, struct fuse_file *ff, struct fuse_attr_out *out_attr) +{ + struct fuse_compound_req *compound; + struct fuse_args open_args = {}, getattr_args = {}; + struct fuse_open_in open_in = {}; + struct fuse_getattr_in getattr_in = {}; + struct fuse_open_out open_out; + struct fuse_attr_out attr_out; + int err; + + /* Build compound request with flag to execute in the given order */ + compound = fuse_compound_alloc(fm, 0); + if (IS_ERR(compound)) + return PTR_ERR(compound); + + /* Add OPEN */ + open_in.flags = flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) + open_in.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) { + open_in.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + open_args.opcode = opcode; + open_args.nodeid = nodeid; + open_args.in_numargs = 1; + open_args.in_args[0].size = sizeof(open_in); + open_args.in_args[0].value = &open_in; + open_args.out_numargs = 1; + open_args.out_args[0].size = sizeof(struct fuse_open_out); + open_args.out_args[0].value = &open_out; + + err = fuse_compound_add(compound, &open_args); + if (err) + goto out; + + /* Add GETATTR */ + getattr_args.opcode = FUSE_GETATTR; + getattr_args.nodeid = nodeid; + getattr_args.in_numargs = 1; + getattr_args.in_args[0].size = sizeof(getattr_in); + getattr_args.in_args[0].value = &getattr_in; + getattr_args.out_numargs = 1; + getattr_args.out_args[0].size = sizeof(struct fuse_attr_out); + getattr_args.out_args[0].value = &attr_out; + + err = 
fuse_compound_add(compound, &getattr_args); + if (err) + goto out; + + err = fuse_compound_send(compound); + if (err) + goto out; + + ff->fh = open_out.fh; + ff->open_flags = open_out.open_flags; + + if (out_attr) + *out_attr = attr_out; + +out: + fuse_compound_free(compound); + return err; +} + struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir) + struct inode *inode, + unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; @@ -141,25 +209,43 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); if (open) { - /* Store outarg for fuse_finish_open() */ - struct fuse_open_out *outargp = &ff->args->open_outarg; - int err; + struct fuse_open_out outarg; + int err = -ENOSYS; + + if (inode && fc->compound_open_getattr) { + struct fuse_attr_out attr_outarg; + err = fuse_compound_open_getattr(fm, nodeid, open_flags, + opcode, ff, &attr_outarg); + if (!err) + fuse_change_attributes(inode, &attr_outarg.attr, NULL, + ATTR_TIMEOUT(&attr_outarg), + fuse_get_attr_version(fc)); + } + if (err == -ENOSYS) { + err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); - err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); - if (!err) { - ff->fh = outargp->fh; - ff->open_flags = outargp->open_flags; - } else if (err != -ENOSYS) { - fuse_file_free(ff); - return ERR_PTR(err); - } else { - /* No release needed */ - kfree(ff->args); - ff->args = NULL; - if (isdir) - fc->no_opendir = 1; - else - fc->no_open = 1; + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + } + } + + if (err) { + if(err != -ENOSYS) { + /* err is not ENOSYS */ + fuse_file_free(ff); + return ERR_PTR(err); + } else { + /* No release needed */ + kfree(ff->release_args); + ff->release_args = NULL; + + /* we don't have open */ + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; 
+ } } } @@ -174,11 +260,10 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { - struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + struct fuse_file *ff = fuse_file_open(fm, nodeid, file_inode(file), file->f_flags, isdir); if (!IS_ERR(ff)) file->private_data = ff; - return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 63731c853c7e50..4a841d27a9db8f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -931,6 +931,9 @@ struct fuse_conn { /** Maximum stack depth for passthrough backing files */ int max_stack_depth; + /* Does the filesystem support compound operations? */ + unsigned int compound_open_getattr:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -1282,6 +1285,19 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); +/** + * Compound request API + */ +struct fuse_compound_req; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, uint32_t flags); +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args); +ssize_t fuse_compound_send(struct fuse_compound_req *compound); +int fuse_compound_get_error(struct fuse_compound_req * compound, + int op_idx); +void fuse_compound_free(struct fuse_compound_req *compound); + /** * End a finished request */ @@ -1542,7 +1558,8 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); /* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir); + struct inode *inode, + unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c 
index c23eb6531bdcb4..bb778cd17d46f8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1036,6 +1036,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; fc->dlm = 1; + + /* pretend fuse server supports compound operations + * until it tells us otherwise. + */ + fc->compound_open_getattr = 1; + atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fdc175e93f7474..07a02e47b2c3a6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -494,7 +494,7 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); - return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); + return fuse_file_open(fm, get_node_id(inode), NULL, O_RDONLY, isdir); } static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 40e26460d69856..d22a0e3b9c0e39 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -672,6 +672,13 @@ enum fuse_opcode { /* Operations which have not been merged into upstream */ FUSE_DLM_WB_LOCK = 100, + /* A compound request works like multiple simple requests. + * This is a special case for calls that can be combined atomic on the + * fuse server. If the server actually does atomically execute the command is + * left to the fuse server implementation. 
+ */ + FUSE_COMPOUND = 101, + /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1271,6 +1278,7 @@ struct fuse_dlm_lock_in { uint32_t reserved; }; + /** * struct fuse_dlm_lock_out - Lock response * @locksize: how many bytes where locked by the call @@ -1283,6 +1291,36 @@ struct fuse_dlm_lock_out { uint64_t reserved; }; +/* + * Compound request header + * + * This header is followed by the fuse requests + */ +struct fuse_compound_in { + uint32_t count; /* Number of operations */ + uint32_t flags; /* Compound flags */ + + /* Total size of all results. + * This is needed for preallocating the whole result for all + * commands in this compound. + */ + uint32_t result_size; + uint64_t reserved; +}; + +/* + * Compound response header + * + * This header is followed by complete fuse responses + */ +struct fuse_compound_out { + uint32_t count; /* Number of results */ + uint32_t flags; /* Result flags */ + uint64_t reserved; +}; + +#define FUSE_MAX_COMPOUND_OPS 16 /* Maximum operations per compound */ + /** * Size of the ring buffer header */ From 35caf14e2d00952f1adce404b9363d96ee53e4df Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Thu, 8 Jan 2026 20:22:06 +0100 Subject: [PATCH 31/47] fuse: simplify compound commands Simplify fuse_compound_req to hold only the pointers to the added fuse args and the request housekeeping. 
Simplify open+getattr call by using helper functions to fill out the fuse request parameters Signed-off-by: Horst Birthelmer (imported from commit 1607a03696693c4ceef7a61adf5759748a7ca9b0) (imported from commit 9df5e4cb96184aae03d7d49131b59a4767641d6b) (imported from commit 9921bcdc4e126a7606e036b04893a6bfd36b8c75) (imported from commit 09d6f59e98090b4de35bfe5344fd1ca5559d1c16) --- fs/fuse/compound.c | 445 ++++++++++++++------------------------------- fs/fuse/dir.c | 9 +- fs/fuse/file.c | 119 +++++++----- fs/fuse/fuse_i.h | 14 +- 4 files changed, 220 insertions(+), 367 deletions(-) diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c index 3758b4666366ea..bc52e22eff3123 100644 --- a/fs/fuse/compound.c +++ b/fs/fuse/compound.c @@ -23,31 +23,19 @@ #include /* - * Compound request + * Compound request builder and state tracker and args pointer storage */ -struct fuse_compound_req -{ +struct fuse_compound_req { struct fuse_mount *fm; struct fuse_compound_in compound_header; struct fuse_compound_out result_header; - size_t total_size; - char *buffer; - size_t buffer_pos; - size_t buffer_size; - - size_t total_expected_out_size; - - /* Operation results for error tracking */ + /* Per-operation error codes */ int op_errors[FUSE_MAX_COMPOUND_OPS]; struct fuse_args *op_args[FUSE_MAX_COMPOUND_OPS]; - - /* Parsing state to avoid double processing */ - bool parsed; }; -struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, - uint32_t flags) +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, u32 flags) { struct fuse_compound_req *compound; @@ -57,309 +45,99 @@ struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, compound->fm = fm; compound->compound_header.flags = flags; - compound->buffer_size = PAGE_SIZE; - compound->buffer = kvmalloc(compound->buffer_size, GFP_KERNEL); - if (!compound->buffer) { - kfree(compound); - return ERR_PTR(-ENOMEM); - } - return compound; -} - -/* - * Free compound request resources - */ -void 
fuse_compound_free(struct fuse_compound_req *compound) -{ - if (compound) { - kvfree(compound->buffer); - kfree(compound); - } -} - -/* - * Validate compound request structure before sending it out. - * Returns 0 on success, negative error code on failure. - */ -static int fuse_compound_validate_header(struct fuse_compound_req *compound) -{ - struct fuse_compound_in *in_header = &compound->compound_header; - size_t offset = 0; - int i; - - if (compound->buffer_pos > compound->buffer_size) - return -EINVAL; - - if (!compound || !compound->buffer) - return -EINVAL; - - if (compound->buffer_pos < sizeof(struct fuse_in_header)) - return -EINVAL; - - if (in_header->count == 0 || in_header->count > FUSE_MAX_COMPOUND_OPS) - return -EINVAL; - - for (i = 0; i < in_header->count; i++) { - const struct fuse_in_header *op_hdr; - - if (offset + sizeof(struct fuse_in_header) > compound->buffer_pos) { - pr_info_ratelimited("FUSE: compound operation %d header extends beyond buffer (offset %zu + header size %zu > buffer pos %zu)\n", - i, offset, sizeof(struct fuse_in_header), compound->buffer_pos); - return -EINVAL; - } - - op_hdr = (const struct fuse_in_header *)(compound->buffer + offset); - - if (op_hdr->len < sizeof(struct fuse_in_header)) { - pr_info_ratelimited("FUSE: compound operation %d has invalid length %u (minimum %zu bytes)\n", - i, op_hdr->len, sizeof(struct fuse_in_header)); - return -EINVAL; - } - - if (offset + op_hdr->len > compound->buffer_pos) { - pr_info_ratelimited("FUSE: compound operation %d extends beyond buffer (offset %zu + length %u > buffer pos %zu)\n", - i, offset, op_hdr->len, compound->buffer_pos); - return -EINVAL; - } - - if (op_hdr->opcode == 0 || op_hdr->opcode == FUSE_COMPOUND) { - pr_info_ratelimited("FUSE: compound operation %d has invalid opcode %u (cannot be 0 or FUSE_COMPOUND)\n", - i, op_hdr->opcode); - return -EINVAL; - } - if (op_hdr->nodeid == 0) { - pr_info_ratelimited("FUSE: compound operation %d has invalid node ID 0\n", i); - return 
-EINVAL; - } - - offset += op_hdr->len; - } - - if (offset != compound->buffer_pos) { - pr_info_ratelimited("FUSE: compound buffer size mismatch (calculated %zu bytes, actual %zu bytes)\n", - offset, compound->buffer_pos); - return -EINVAL; - } - - return 0; + return compound; } -/* - * Adds a single operation to the compound request. The operation is serialized - * into the request buffer with its own fuse_in_header. - * - * For operations with page-based payloads (in_pages=true), the page data is - * ignored at the moment. - * - * Returns 0 on success, negative error code on failure. - */ int fuse_compound_add(struct fuse_compound_req *compound, struct fuse_args *args) { - struct fuse_in_header *hdr; - size_t args_size = 0; - size_t needed_size; - size_t expected_out_size = 0; - size_t page_payload_size = 0; - int i; - - if (!compound || compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) + if (!compound || + compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) return -EINVAL; - /* Calculate input size - handle page-based arguments separately */ - for (i = 0; i < args->in_numargs; i++) { - /* Last argument with in_pages flag gets data from pages */ - if (unlikely(i == args->in_numargs - 1 && args->in_pages)) { - /* the data handling is not supported at the moment */ - page_payload_size = args->in_args[i].size; - args_size += page_payload_size; - } else { - args_size += args->in_args[i].size; - } - } - - /* Calculate expected output size */ - for (i = 0; i < args->out_numargs; i++) - expected_out_size += args->out_args[i].size; - - needed_size = sizeof(struct fuse_in_header) + args_size; - - /* Expand buffer if needed */ - if (compound->buffer_pos + needed_size > compound->buffer_size) { - size_t new_size = max(compound->buffer_size * 2, - compound->buffer_pos + needed_size); - new_size = round_up(new_size, PAGE_SIZE); - char *new_buffer = kvrealloc(compound->buffer, - compound->buffer_size, - new_size, GFP_KERNEL); - if (!new_buffer) - return -ENOMEM; 
- compound->buffer = new_buffer; - compound->buffer_size = new_size; - } - - /* Build request header */ - hdr = (struct fuse_in_header *)(compound->buffer + compound->buffer_pos); - memset(hdr, 0, sizeof(*hdr)); - hdr->len = needed_size; - hdr->opcode = args->opcode; - hdr->nodeid = args->nodeid; - hdr->uid = from_kuid(compound->fm->fc->user_ns, current_fsuid()); - hdr->gid = from_kgid(compound->fm->fc->user_ns, current_fsgid()); - hdr->pid = pid_nr_ns(task_pid(current), compound->fm->fc->pid_ns); - hdr->unique = fuse_get_unique(&compound->fm->fc->iq); - compound->buffer_pos += sizeof(*hdr); - - if (args->in_pages) { - /* we have external payload, - * this is not supported at the moment */ + if (args->in_pages) return -EINVAL; - } - - /* Copy operation arguments */ - for (i = 0; i < args->in_numargs; i++) { - memcpy(compound->buffer + compound->buffer_pos, - args->in_args[i].value, args->in_args[i].size); - compound->buffer_pos += args->in_args[i].size; - } - compound->total_expected_out_size += expected_out_size; - - /* Store args for response parsing */ compound->op_args[compound->compound_header.count] = args; - compound->compound_header.count++; - compound->total_size += needed_size; - return 0; } -/* - * Copy response data to fuse_args structure - * - * Returns 0 on success, negative error code on failure. 
- */ -static void *fuse_copy_response_data(struct fuse_args *args, char *response_data) +static void *fuse_copy_response_per_req(struct fuse_args *args, + char *resp) { + int i; size_t copied = 0; - int arg_idx; - - for (arg_idx = 0; arg_idx < args->out_numargs; arg_idx++) { - struct fuse_arg current_arg = args->out_args[arg_idx]; - - /* Last argument with out_pages: copy to pages */ - if (arg_idx == args->out_numargs - 1 && args->out_pages) { - /* external payload (in the last out arg) - * is not supported at the moment - */ - return response_data; - } else { - size_t arg_size = current_arg.size; - if (current_arg.value && arg_size > 0) { - memcpy(current_arg.value, - (char *)response_data + copied, - arg_size); - copied += arg_size; - } + + for (i = 0; i < args->out_numargs; i++) { + struct fuse_arg current_arg = args->out_args[i]; + size_t arg_size = current_arg.size; + + if (current_arg.value && arg_size > 0) { + memcpy(current_arg.value, + (char *)resp + copied, arg_size); + copied += arg_size; } } - return (char*)response_data + copied; + return (char *)resp + copied; } -int fuse_compound_get_error(struct fuse_compound_req * compound, - int op_idx) +int fuse_compound_get_error(struct fuse_compound_req *compound, int op_idx) { return compound->op_errors[op_idx]; } -/* - * Parse compound response - * - * Parses the compound response and populates the original - * fuse_args structures with the response data. This function is idempotent - * and can be called multiple times safely. - * - * For operations with page-based output (out_pages=true), the response data - * is ignored at the moment. - * - * Returns 0 on success, negative error code on failure. 
- */ -static int fuse_compound_parse_resp(struct fuse_compound_req *compound, - uint32_t count, void *response, size_t response_size) +static void *fuse_compound_parse_one_op(struct fuse_compound_req *compound, + int op_index, void *op_out_data, + void *response_end) { - int i; - int res = 0; + struct fuse_out_header *op_hdr = op_out_data; + struct fuse_args *args = compound->op_args[op_index]; - /* double parsing prevention will be important - * for large responses most likely out pages. - */ - if (compound->parsed) { - return 0; - } + if (op_hdr->len < sizeof(struct fuse_out_header)) + return NULL; - void *op_out_data = response; - void *response_end = (char *)response + response_size; + /* Check if the entire operation response fits in the buffer */ + if ((char *)op_out_data + op_hdr->len > (char *)response_end) + return NULL; - /* Basic validation */ - if (!response || response_size < sizeof(struct fuse_out_header)) { - return -EIO; - } + if (op_hdr->error != 0) + compound->op_errors[op_index] = op_hdr->error; - /* Parse each operation response */ - for (i = 0; - i < count && i < compound->result_header.count; i++) { - struct fuse_out_header *op_hdr = op_out_data; - struct fuse_args *args = compound->op_args[i]; + if (args && op_hdr->len > sizeof(struct fuse_out_header)) + return fuse_copy_response_per_req(args, op_out_data + + sizeof(struct fuse_out_header)); - /* Validate header length */ - if (op_hdr->len < sizeof(struct fuse_out_header)) { - return -EIO; - } + /* No response data, just advance past the header */ + return (char *)op_out_data + op_hdr->len; +} - /* Check if the entire operation response fits in the buffer */ - if ((char *)op_out_data + op_hdr->len > (char *)response_end) { - return -EIO; - } +static int fuse_compound_parse_resp(struct fuse_compound_req *compound, + u32 count, void *response, + size_t response_size) +{ + void *op_out_data = response; + void *response_end = (char *)response + response_size; + int i; - if (op_hdr->error != 0) { 
- compound->op_errors[i] = op_hdr->error; - } + if (!response || response_size < sizeof(struct fuse_out_header)) + return -EIO; - /* Copy response data */ - if (args && op_hdr->len > sizeof(struct fuse_out_header)) { - op_out_data = fuse_copy_response_data(args, - op_out_data + sizeof(struct fuse_out_header)); - } else { - /* No response data, just advance past the header */ - op_out_data = (char *)op_out_data + op_hdr->len; - } + for (i = 0; i < count && i < compound->result_header.count; i++) { + op_out_data = fuse_compound_parse_one_op(compound, i, + op_out_data, + response_end); + if (!op_out_data) + return -EIO; } - compound->parsed = true; - return res; + return 0; } -/* - * Send compound request to userspace - * - * Sends the compound request out and parses the response. - * - * -> in_arg[0] -> fuse_compound_in (containing mainly count and flags) - * -> in_arg[1] -> payload - * (containing the serialized requests created by fuse_compound_add) - * - * On success, the response data is copied to the original fuse_args - * structures for each operation. - * - * Returns 0 on success, or the first error code from any operation. - * Returns negative error code if the request itself fails. 
- */ ssize_t fuse_compound_send(struct fuse_compound_req *compound) { - size_t expected_response_size; - ssize_t ret; struct fuse_args args = { .opcode = FUSE_COMPOUND, .nodeid = 0, @@ -367,9 +145,18 @@ ssize_t fuse_compound_send(struct fuse_compound_req *compound) .out_numargs = 2, .out_argvar = true, }; + size_t resp_buffer_size; + size_t actual_response_size; + size_t buffer_pos; + size_t total_expected_out_size; + void *buffer = NULL; + void *resp_payload; + ssize_t ret; + int i; if (!compound) { - pr_info_ratelimited("FUSE: compound request is NULL in fuse_compound_send\n"); + pr_info_ratelimited("FUSE: compound request is NULL in %s\n", + __func__); return -EINVAL; } @@ -378,55 +165,99 @@ ssize_t fuse_compound_send(struct fuse_compound_req *compound) return -EINVAL; } - /* Calculate response buffer size */ - expected_response_size = - compound->total_expected_out_size; - size_t total_buffer_size = expected_response_size + - (compound->compound_header.count * sizeof(struct fuse_out_header)); + buffer_pos = 0; + total_expected_out_size = 0; + + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + buffer_pos += needed_size; - void *resp_payload = kvmalloc(total_buffer_size, GFP_KERNEL | __GFP_ZERO); + for (j = 0; j < op_args->out_numargs; j++) + total_expected_out_size += op_args->out_args[j].size; + } - if (!resp_payload) + buffer = kvmalloc(buffer_pos, GFP_KERNEL); + if (!buffer) return -ENOMEM; - /* tell the fuse server how much memory we have allocated */ - compound->compound_header.result_size = expected_response_size; + + buffer_pos = 0; + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + struct fuse_in_header *hdr; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 
0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + hdr = (struct fuse_in_header *)(buffer + buffer_pos); + memset(hdr, 0, sizeof(*hdr)); + hdr->len = needed_size; + hdr->opcode = op_args->opcode; + hdr->nodeid = op_args->nodeid; + hdr->uid = from_kuid(compound->fm->fc->user_ns, + current_fsuid()); + hdr->gid = from_kgid(compound->fm->fc->user_ns, + current_fsgid()); + hdr->pid = pid_nr_ns(task_pid(current), + compound->fm->fc->pid_ns); + buffer_pos += sizeof(*hdr); + + for (j = 0; j < op_args->in_numargs; j++) { + memcpy(buffer + buffer_pos, op_args->in_args[j].value, + op_args->in_args[j].size); + buffer_pos += op_args->in_args[j].size; + } + } + + resp_buffer_size = total_expected_out_size + + (compound->compound_header.count * + sizeof(struct fuse_out_header)); + + resp_payload = kvmalloc(resp_buffer_size, GFP_KERNEL | __GFP_ZERO); + if (!resp_payload) { + ret = -ENOMEM; + goto out_free_buffer; + } + + compound->compound_header.result_size = total_expected_out_size; args.in_args[0].size = sizeof(compound->compound_header); args.in_args[0].value = &compound->compound_header; - args.in_args[1].size = compound->buffer_pos; - args.in_args[1].value = compound->buffer; + args.in_args[1].size = buffer_pos; + args.in_args[1].value = buffer; args.out_args[0].size = sizeof(compound->result_header); args.out_args[0].value = &compound->result_header; - args.out_args[1].size = total_buffer_size; + args.out_args[1].size = resp_buffer_size; args.out_args[1].value = resp_payload; - /* Validate request */ - ret = fuse_compound_validate_header(compound); - if (ret) + ret = fuse_simple_request(compound->fm, &args); + if (ret < 0) goto out; - ret = fuse_compound_request(compound->fm, &args); - if (ret == -ENOSYS) { - goto out; - } - - size_t actual_response_size = args.out_args[1].size; + actual_response_size = args.out_args[1].size; - /* Validate response size */ if (actual_response_size < sizeof(struct fuse_compound_out)) { pr_info_ratelimited("FUSE: 
compound response too small (%zu bytes, minimum %zu bytes)\n", - actual_response_size, sizeof(struct fuse_compound_out)); + actual_response_size, + sizeof(struct fuse_compound_out)); ret = -EINVAL; goto out; } - /* Parse response using actual size */ - ret = fuse_compound_parse_resp(compound, - compound->result_header.count, - ((char *)resp_payload), - actual_response_size); + ret = fuse_compound_parse_resp(compound, compound->result_header.count, + (char *)resp_payload, + actual_response_size); out: kvfree(resp_payload); +out_free_buffer: + kvfree(buffer); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f678659bf44a04..17f9dac2377bbe 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1307,14 +1307,7 @@ static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.opcode = FUSE_GETATTR; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + fuse_getattr_args_fill(&args, get_node_id(inode), &inarg, &outarg); err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d50b4f12ee4297..373dec9381b2fa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -24,6 +24,39 @@ #include #include +/* + * Helper function to initialize fuse_args for OPEN/OPENDIR operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg) +{ + args->opcode = opcode; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + +/* + * Helper function to initialize fuse_args for GETATTR 
operations + */ +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg) +{ + args->opcode = FUSE_GETATTR; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, struct fuse_open_out *outargp) @@ -41,14 +74,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = opcode; - args.nodeid = nodeid; - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(*outargp); - args.out_args[0].value = outargp; + fuse_open_args_fill(&args, nodeid, opcode, &inarg, outargp); return fuse_simple_request(fm, &args); } @@ -125,53 +151,38 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) } } -static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, int flags, - int opcode, struct fuse_file *ff, struct fuse_attr_out *out_attr) +static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, + int flags, int opcode, + struct fuse_file *ff, + struct fuse_attr_out *outattrp, + struct fuse_open_out *outopenp) { struct fuse_compound_req *compound; - struct fuse_args open_args = {}, getattr_args = {}; + struct fuse_args open_args = {}; + struct fuse_args getattr_args = {}; struct fuse_open_in open_in = {}; struct fuse_getattr_in getattr_in = {}; - struct fuse_open_out open_out; - struct fuse_attr_out attr_out; int err; - /* Build compound request with flag to execute in the given order */ compound = fuse_compound_alloc(fm, 0); if (IS_ERR(compound)) return PTR_ERR(compound); - /* Add OPEN */ open_in.flags = flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if 
(!fm->fc->atomic_o_trunc) open_in.flags &= ~O_TRUNC; if (fm->fc->handle_killpriv_v2 && - (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) { + (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) open_in.open_flags |= FUSE_OPEN_KILL_SUIDGID; - } - open_args.opcode = opcode; - open_args.nodeid = nodeid; - open_args.in_numargs = 1; - open_args.in_args[0].size = sizeof(open_in); - open_args.in_args[0].value = &open_in; - open_args.out_numargs = 1; - open_args.out_args[0].size = sizeof(struct fuse_open_out); - open_args.out_args[0].value = &open_out; + + fuse_open_args_fill(&open_args, nodeid, opcode, &open_in, outopenp); err = fuse_compound_add(compound, &open_args); if (err) goto out; - /* Add GETATTR */ - getattr_args.opcode = FUSE_GETATTR; - getattr_args.nodeid = nodeid; - getattr_args.in_numargs = 1; - getattr_args.in_args[0].size = sizeof(getattr_in); - getattr_args.in_args[0].value = &getattr_in; - getattr_args.out_numargs = 1; - getattr_args.out_args[0].size = sizeof(struct fuse_attr_out); - getattr_args.out_args[0].value = &attr_out; + fuse_getattr_args_fill(&getattr_args, nodeid, &getattr_in, outattrp); err = fuse_compound_add(compound, &getattr_args); if (err) @@ -181,14 +192,19 @@ static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, int fla if (err) goto out; - ff->fh = open_out.fh; - ff->open_flags = open_out.open_flags; + err = fuse_compound_get_error(compound, 0); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 1); + if (err) + goto out; - if (out_attr) - *out_attr = attr_out; + ff->fh = outopenp->fh; + ff->open_flags = outopenp->open_flags; out: - fuse_compound_free(compound); + kfree(compound); return err; } @@ -209,36 +225,41 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? 
FOPEN_CACHE_DIR : 0); if (open) { - struct fuse_open_out outarg; + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err = -ENOSYS; if (inode && fc->compound_open_getattr) { struct fuse_attr_out attr_outarg; + err = fuse_compound_open_getattr(fm, nodeid, open_flags, - opcode, ff, &attr_outarg); + opcode, ff, + &attr_outarg, outargp); + if (err == -ENOSYS) + fc->compound_open_getattr = 0; if (!err) - fuse_change_attributes(inode, &attr_outarg.attr, NULL, + fuse_change_attributes(inode, &attr_outarg.attr, + NULL, ATTR_TIMEOUT(&attr_outarg), fuse_get_attr_version(fc)); } if (err == -ENOSYS) { - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); - + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } } if (err) { - if(err != -ENOSYS) { + if (err != -ENOSYS) { /* err is not ENOSYS */ fuse_file_free(ff); return ERR_PTR(err); } else { /* No release needed */ - kfree(ff->release_args); - ff->release_args = NULL; + kfree(ff->args); + ff->args = NULL; /* we don't have open */ if (isdir) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4a841d27a9db8f..2607cd8d5165bb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -930,7 +930,7 @@ struct fuse_conn { /** Maximum stack depth for passthrough backing files */ int max_stack_depth; - + /* Does the filesystem support compound operations? 
*/ unsigned int compound_open_getattr:1; @@ -1191,6 +1191,14 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); +/* + * Helper functions to initialize fuse_args for common operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg); +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg); struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); @@ -1296,7 +1304,6 @@ int fuse_compound_add(struct fuse_compound_req *compound, ssize_t fuse_compound_send(struct fuse_compound_req *compound); int fuse_compound_get_error(struct fuse_compound_req * compound, int op_idx); -void fuse_compound_free(struct fuse_compound_req *compound); /** * End a finished request @@ -1559,7 +1566,8 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); /* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct inode *inode, - unsigned int open_flags, bool isdir); + unsigned int open_flags, + bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); From d132097d53177af9bc3e311fcb5c138a319d1129 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 12 Dec 2025 14:13:10 +0100 Subject: [PATCH 32/47] RED-34640: Fix a startup teardown race There was a race between fuse_uring_cancel() and fuse_uring_register()/fuse_uring_next_fuse_req(), which comes from the queue reduction feature. 
Race was core-A core-B fuse_uring_register spin_lock(&queue->lock); fuse_uring_ent_avail() spin_unlock(&queue->lock); fuse_uring_cancel() spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; list_move() fuse_uring_next_fuse_req() spin_lock(&queue->lock); fuse_uring_ent_avail(ent, queue); fuse_uring_send_next_to_ring() spin_unlock(&queue->lock); fuse_uring_send_next_to_ring I.e. fuse_uring_ent_avail() was called two times and the 2nd time when the entry was actually already handled by fuse_uring_cancel(). Solution is to not call fuse_uring_ent_avail() from fuse_uring_register. With that the entry is not in state FRRS_AVAILABLE and fuse_uring_cancel() will not touch it. fuse_uring_send_next_to_ring() will mark it as FRRS_AVAILABLE, and then either assign a request to it and change state again or will not touch it at all anymore - race fixed. This will be folded into the upstream queue reduction patches and therefore has the RED-34640 commit message. Also entirely removed is fuse_uring_do_register() as remaining work can be done by the caller. 
Signed-off-by: Bernd Schubert (imported from commit 932febaee72bfc10a391cdfa14a2b7f37549d967) --- fs/fuse/dev_uring.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ab5373548acc26..e7e267c4437d06 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1175,36 +1175,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -/* - * fuse_uring_req_fetch command handling - */ -static void fuse_uring_do_register(struct fuse_ring_ent *ent, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - struct fuse_ring_queue *queue = ent->queue; - struct fuse_ring *ring = queue->ring; - struct fuse_conn *fc = ring->fc; - struct fuse_iqueue *fiq = &fc->iq; - int node = cpu_to_node(queue->qid); - - if (WARN_ON_ONCE(node >= ring->nr_numa_nodes)) - node = 0; - - fuse_uring_prepare_cancel(cmd, issue_flags, ent); - - spin_lock(&queue->lock); - ent->cmd = cmd; - fuse_uring_ent_avail(ent, queue); - spin_unlock(&queue->lock); - - if (!ring->ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } -} - /* * Copy from memmap.c, should be exported there */ @@ -1385,6 +1355,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; + struct fuse_iqueue *fiq = &fc->iq; int err; unsigned int qid = READ_ONCE(cmd_req->qid); @@ -1416,8 +1387,18 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, if (IS_ERR(ent)) return PTR_ERR(ent); - fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + if (!ring->ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + + spin_lock(&queue->lock); + ent->cmd = cmd; + spin_unlock(&queue->lock); + /* 
Marks the ring entry as ready */ fuse_uring_next_fuse_req(ent, queue, issue_flags); return 0; From 23e29f8cf0bb61922b1b89b1c68774733ab88a55 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Oct 2025 23:17:15 +0200 Subject: [PATCH 33/47] fuse: Move ring queues_refs decrement This is just to avoid code dup with an upcoming commit. Signed-off-by: Bernd Schubert (imported from commit ec3217f655d816ac9e3e29b1dc1506d7b195a0a5) --- fs/fuse/dev_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index e7e267c4437d06..2bac062731b60f 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -470,7 +470,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) { struct fuse_req *req; struct io_uring_cmd *cmd; - + ssize_t queue_refs; struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); @@ -498,15 +498,16 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) if (req) fuse_uring_stop_fuse_req_end(req); + + queue_refs = atomic_dec_return(&queue->ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); } static void fuse_uring_stop_list_entries(struct list_head *head, struct fuse_ring_queue *queue, enum fuse_ring_req_state exp_state) { - struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent, *next; - ssize_t queue_refs = SSIZE_MAX; LIST_HEAD(to_teardown); spin_lock(&queue->lock); @@ -523,11 +524,8 @@ static void fuse_uring_stop_list_entries(struct list_head *head, spin_unlock(&queue->lock); /* no queue lock to avoid lock order issues */ - list_for_each_entry_safe(ent, next, &to_teardown, list) { + list_for_each_entry_safe(ent, next, &to_teardown, list) fuse_uring_entry_teardown(ent); - queue_refs = atomic_dec_return(&ring->queue_refs); - WARN_ON_ONCE(queue_refs < 0); - } } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) From 845125979f8c1be433eadecb7eb6d5b43438b38d Mon Sep 17 00:00:00 2001 From: Jian Huang Li Date: Mon, 20 
Oct 2025 23:23:11 +0200 Subject: [PATCH 34/47] fs/fuse: fix potential memory leak from fuse_uring_cancel This issue could be observed sometimes during libfuse xfstests, where dmesg prints something like "kernel: WARNING: CPU: 4 PID: 0 at fs/fuse/dev_uring.c:204 fuse_uring_destruct+0x1f5/0x200 [fuse]". The cause is: when the fuse daemon has just submitted FUSE_IO_URING_CMD_REGISTER SQEs, umount or the fuse daemon quits at this very early stage. After all uring queues stopped, one or more unprocessed FUSE_IO_URING_CMD_REGISTER SQEs might get processed, then some new ring entities are created and added to ent_avail_queue, and immediately fuse_uring_cancel moved them to ent_in_userspace after SQEs get canceled. These ring entities were not moved to ent_released, and stayed in ent_in_userspace when fuse_uring_destruct was called. One way to solve it would be to also free 'ent_in_userspace' in fuse_uring_destruct(), but from a code point of view it is hard to see why it is needed. As suggested by Joanne, another solution is to avoid moving entries in fuse_uring_cancel() to the 'ent_in_userspace' list and just release them directly.
Fixes: b6236c8407cb ("fuse: {io-uring} Prevent mount point hang on fuse-server termination") Cc: Joanne Koong Cc: # v6.14 Signed-off-by: Jian Huang Li Signed-off-by: Bernd Schubert (imported from commit 30d0473dcc0eecac6b1e00d9d87b0892146086a9) --- debian/scripts/misc/kconfig/__init__.py | 0 fs/fuse/dev_uring.c | 21 +++++++++------------ 2 files changed, 9 insertions(+), 12 deletions(-) delete mode 100644 debian/scripts/misc/kconfig/__init__.py diff --git a/debian/scripts/misc/kconfig/__init__.py b/debian/scripts/misc/kconfig/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 2bac062731b60f..5a70a87c1b3c07 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -466,7 +466,7 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) /* * Release a request/entry on connection tear down */ -static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent, int issue_flags) { struct fuse_req *req; struct io_uring_cmd *cmd; @@ -494,7 +494,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); if (req) fuse_uring_stop_fuse_req_end(req); @@ -525,7 +525,7 @@ static void fuse_uring_stop_list_entries(struct list_head *head, /* no queue lock to avoid lock order issues */ list_for_each_entry_safe(ent, next, &to_teardown, list) - fuse_uring_entry_teardown(ent); + fuse_uring_entry_teardown(ent, IO_URING_F_UNLOCKED); } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) @@ -651,7 +651,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue; - bool need_cmd_done = false; + bool teardown = false; /* * direct access on ent - it must not be destructed 
as long as @@ -660,17 +660,14 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, queue = ent->queue; spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - need_cmd_done = true; - ent->cmd = NULL; + ent->state = FRRS_TEARDOWN; + list_del_init(&ent->list); + teardown = true; } spin_unlock(&queue->lock); - if (need_cmd_done) { - /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); - } + if (teardown) + fuse_uring_entry_teardown(ent, issue_flags); } static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, From 4defc5204f1ae6d6f6456a77e78aa682837308d8 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 25 Nov 2025 10:13:47 -0800 Subject: [PATCH 35/47] fuse: fix io-uring list corruption for terminated non-committed requests When a request is terminated before it has been committed, the request is not removed from the queue's list. This leaves a dangling list entry that leads to list corruption and use-after-free issues. Remove the request from the queue's list for terminated non-committed requests. 
Signed-off-by: Joanne Koong Fixes: c090c8abae4b ("fuse: Add io-uring sqe commit and fetch support") Cc: stable@vger.kernel.org Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi (imported from commit 07214a0156daca7df97f0c25528259f76b5e2b10) --- fs/fuse/dev_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 5a70a87c1b3c07..f54e32cb57898d 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -106,6 +106,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, spin_lock(&queue->lock); ent->fuse_req = NULL; queue->nr_reqs--; + list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); From ae1ce136b43ede7056df95d9a7ed6870ecd19185 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sun, 23 Nov 2025 17:43:40 +0100 Subject: [PATCH 36/47] fuse: Fix missing numa_q_map free in dev_uring This fixes a memory leak. (imported from commit f75b62fce0e6689b1cc57bdae4b6a93be1ca2168) --- fs/fuse/dev_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f54e32cb57898d..4da7988f83e190 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -234,9 +234,11 @@ static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) fuse_ring_destruct_q_map(&ring->q_map); - if (ring->numa_q_map) + if (ring->numa_q_map) { for (node = 0; node < ring->nr_numa_nodes; node++) fuse_ring_destruct_q_map(&ring->numa_q_map[node]); + kfree(ring->numa_q_map); + } } void fuse_uring_destruct(struct fuse_conn *fc) From a8f31aa9d47f13fa15f1e7796ca042ccc37e5049 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Aug 2025 13:10:44 +0200 Subject: [PATCH 37/47] fuse: allow synchronous FUSE_INIT FUSE_INIT has always been asynchronous with mount. That means that the server processed this request after the mount syscall returned. 
This means that FUSE_INIT can't supply the root inode's ID, hence it currently has a hardcoded value. There are other limitations such as not being able to perform getxattr during mount, which is needed by selinux. To remove these limitations allow the server to process FUSE_INIT while initializing the in-core super block for the fuse filesystem. This can only be done if the server is prepared to handle this, so add FUSE_DEV_IOC_SYNC_INIT ioctl, which a) lets the server know whether this feature is supported, returning ENOTTY otherwise. b) lets the kernel know to perform a synchronous initialization The implementation is slightly tricky, since fuse_dev/fuse_conn are set up only during super block creation. This is solved by setting the private data of the fuse device file to a special value ((struct fuse_dev *) 1) and waiting for this to be turned into a proper fuse_dev before commencing with operations on the device file. Signed-off-by: Miklos Szeredi (imported from commit dfb84c33079497bf27058b15780e1c7bba4c371b) --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 71 +++++++++++++++++++++++++++++---------- fs/fuse/dev_uring.c | 4 +-- fs/fuse/fuse_dev_i.h | 13 +++++-- fs/fuse/fuse_i.h | 5 ++- fs/fuse/inode.c | 51 ++++++++++++++++++++++------ include/uapi/linux/fuse.h | 1 + 7 files changed, 114 insertions(+), 34 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a806f..28c96961e85d1c 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index
eb8d8551193dd5..ab802fd544876c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1542,14 +1542,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1569,8 +1589,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2243,7 +2263,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2265,11 +2285,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2355,7 +2374,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; 
@@ -2541,7 +2560,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2572,8 +2591,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2583,7 +2602,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2614,7 +2633,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2632,8 +2651,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2649,8 +2668,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2661,6 +2680,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + 
mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2676,6 +2708,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2684,7 +2719,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 4da7988f83e190..88ddf96c72d8dd 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1423,9 +1423,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index fab53a1cc37ed5..4037fd7bdeee66 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -39,15 +41,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. 
*/ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2607cd8d5165bb..fa554029fa8cf6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -925,6 +925,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1372,7 +1375,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bb778cd17d46f8..82dfb8e6f4ff3b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -8,6 +8,7 @@ #include "fuse_i.h" #include "fuse_dlm_cache.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -35,6 +36,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -1546,7 +1548,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1605,10 +1607,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; 
ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1949,8 +1971,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1958,8 +1984,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1980,6 +2008,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -2000,8 +2029,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -2062,7 +2093,7 @@ 
static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d22a0e3b9c0e39..3cb85385cb87f5 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1152,6 +1152,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; From cbd27323b8388721815f4aa73eec539910f8da16 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Fri, 19 Dec 2025 10:04:50 +0100 Subject: [PATCH 38/47] fuse: compounds fix includes no functional changes Signed-off-by: Horst Birthelmer (imported from commit f0bccb2ea093d8bf703d535d34541b3000ec1d86) --- fs/fuse/compound.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c index bc52e22eff3123..5d84e3558a06f8 100644 --- a/fs/fuse/compound.c +++ b/fs/fuse/compound.c @@ -10,18 +10,6 @@ #include "fuse_i.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - /* * Compound request builder and state tracker and args pointer storage */ From bf04ff5f0395c36f59800b3efddf36d2e2e18c97 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Tue, 30 Sep 2025 03:00:46 +0800 Subject: [PATCH 39/47] Create workflow the create pr for redfs in each branch Take actions on the PR merged event of this repo. Run copy-from-linux-branch.sh and create a PR for redfs. 
(cherry picked from commit f54872e99c6ebccc92c202e15c22eb68c26b10f6) (imported from commit 522fddfe975a361a411b853eb6b40c62e35ad39e) --- .github/workflows/create-redfs-pr.yml | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/create-redfs-pr.yml diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml new file mode 100644 index 00000000000000..cd7e9717440c4e --- /dev/null +++ b/.github/workflows/create-redfs-pr.yml @@ -0,0 +1,92 @@ +# Automatially run copy-from-linux-branch.sh on branches and create PR for redfs. +name: Sync to redfs repo +on: + # Triggers the workflow on pull request merged. + pull_request: + branches: [ "*" ] + types: [ "closed" ] + +jobs: + create-redfs-pr: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + # Checks-out to a different directory to avoid following checkout removing it. + - uses: actions/checkout@v4 + with: + path: linux + + - name: Try to checkout sync-${{ github.ref_name }} if it exists + uses: actions/checkout@v4 + id: try-checkout + continue-on-error: true + with: + repository: DDNStorage/redfs + ref: sync-${{ github.ref_name }} + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Fallback to checkout main + if: steps.try-checkout.outcome == 'failure' + uses: actions/checkout@v4 + with: + repository: DDNStorage/redfs + ref: main + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Initialize git + run: | + git config --global user.name "DDNStorage RED Workflow" + git config --global user.email "red@ddn.com" + + - name: Create tracking branch based on main + if: steps.try-checkout.outcome == 'failure' + run: | + pushd redfs + git checkout -b sync-${{ github.ref_name }} + popd + + - name: Generate PR for redfs + run: | + declare -A MAP + MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.26.1.el9_6" + 
MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" + kerver=${MAP["${{ github.ref_name }}"]} + if [ -z ${kerver} ]; then + echo "Cannot find target kernel version" + exit 1 + fi + pushd redfs + ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} + git add src/$kerver + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} \n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg + echo -e "${{ github.sha }}" >> ../commit.msg + RET=0 + git commit -F ../commit.msg 2> ../commit.log || RET=$?; + if [ -s ../commit.log ]; then + echo "Error detcted in commit:" + cat ../commit.log + exit 1 + elif [ $RET -eq 0 ]; then + echo "Done. Push the code to remote:" + git push origin sync-${{ github.ref_name }} 2> ../push.log ||: + else + echo "No changes to existed codes. Still try with PR." + fi + if [ -s ../push.log ]; then + echo "Error detected in push:" + cat ../push.log + fi + gh pr create --base main --fill || RET=$? + if [ $RET -eq 1 ]; then + echo "No pending changes for PR, returning $RET." + fi + popd + env: + GH_TOKEN: ${{ secrets.OPENUNIXPAT }} + From 99d238205a0ab85c75660b74dbbc1b010a21ca34 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Tue, 30 Dec 2025 08:56:41 +0800 Subject: [PATCH 40/47] Fix the github actions PR trigger Switch to pull_request_target instead of pull_request as the github security requirement. Also limits the scope to protected PR. 
(cherry picked from commit b9980ad9af3598d465c72fb92f565415c8d4a006) (imported from commit e504e4a44abfa9cef941189e229cef0412c3f014) --- .github/workflows/create-redfs-pr.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml index cd7e9717440c4e..1f7b99a60c7aff 100644 --- a/.github/workflows/create-redfs-pr.yml +++ b/.github/workflows/create-redfs-pr.yml @@ -3,7 +3,10 @@ name: Sync to redfs repo on: # Triggers the workflow on pull request merged. pull_request: - branches: [ "*" ] + branches: [ "redfs-*" ] + types: [ "closed" ] + pull_request_target: + branches: [ "redfs-*" ] types: [ "closed" ] jobs: @@ -52,8 +55,9 @@ jobs: - name: Generate PR for redfs run: | declare -A MAP + MAP["redfs-rhel9_4-427.42.1"]="5.14.0-427.42.1.el9_4" MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" - MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.26.1.el9_6" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.12.1.el9_6" MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" kerver=${MAP["${{ github.ref_name }}"]} if [ -z ${kerver} ]; then @@ -63,7 +67,7 @@ jobs: pushd redfs ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} git add src/$kerver - echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} \n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }}\n" > ../commit.msg echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg echo -e "${{ github.sha }}" >> ../commit.msg RET=0 @@ -79,7 +83,7 @@ jobs: echo "No changes to existed codes. Still try with PR." fi if [ -s ../push.log ]; then - echo "Error detected in push:" + echo "Message detected in push:" cat ../push.log fi gh pr create --base main --fill || RET=$? 
@@ -88,5 +92,5 @@ jobs: fi popd env: - GH_TOKEN: ${{ secrets.OPENUNIXPAT }} + GH_TOKEN: ${{ secrets.REDFS_TOKEN }} From f6c808e8a27b3217db1c260285eb34eb0d363c56 Mon Sep 17 00:00:00 2001 From: Shuo Feng Date: Tue, 30 Dec 2025 11:18:10 +0800 Subject: [PATCH 41/47] Remove the pull_request_target from actions Remove the pull_request_target as it doesn't work. (cherry picked from commit 5328f660acf48ef3cf1f00ab8ae486aedf6874ee) (imported from commit 5277386783667357873cdd2819517b301a4b5063) --- .github/workflows/create-redfs-pr.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml index 1f7b99a60c7aff..cc03d7e1219e9b 100644 --- a/.github/workflows/create-redfs-pr.yml +++ b/.github/workflows/create-redfs-pr.yml @@ -5,9 +5,6 @@ on: pull_request: branches: [ "redfs-*" ] types: [ "closed" ] - pull_request_target: - branches: [ "redfs-*" ] - types: [ "closed" ] jobs: create-redfs-pr: From abffbbeff5cf19a1849b15d8666a4ea11f4807fb Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 13 Jan 2026 17:58:23 +0100 Subject: [PATCH 42/47] fuse: Make compounds a module option For now compounds are a module option and disabled by default Signed-off-by: Bernd Schubert (imported from commit f3b301ddccefec9e6363bb14e307c51462c0cc6a) --- fs/fuse/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 82dfb8e6f4ff3b..ada68233879740 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -33,6 +33,10 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); +static bool __read_mostly enable_compound; +module_param(enable_compound, bool, 0644); +MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); + static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); @@ -1039,10 +1043,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, 
fc->connected = 1; fc->dlm = 1; - /* pretend fuse server supports compound operations - * until it tells us otherwise. - */ - fc->compound_open_getattr = 1; + /* module option for now */ + fc->compound_open_getattr = enable_compound; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); From 4ff6d6034ede857e03df89afa73fb92ca217b004 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 4 Feb 2026 18:47:37 +0100 Subject: [PATCH 43/47] fuse: Fix the reduced queue assignment The use of bitmap_weight() didn't give the actual index, but always returned the current cpu, which resulted in a totally wrong mapping. It now just increases a counter for every mapping and ignores cores not in the given (numa) map and then find the index for that. Also added is a pr_debug(), which can be activated for example with echo "module redfs +p" >/proc/dynamic_debug/control (Pity that upstream is not open for such debug messages). (imported from commit bcbb684ad26c86cc77c04fdab1584ff1ed6bc270) --- fs/fuse/dev_uring.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 88ddf96c72d8dd..1522d5114562c7 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -371,26 +371,25 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) } static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, - struct fuse_queue_map *q_map) + struct fuse_queue_map *q_map, + int node) { - int cpu, qid_idx; + int cpu, qid_idx, mapping_count = 0; size_t nr_queues; cpumask_set_cpu(qid, q_map->registered_q_mask); nr_queues = cpumask_weight(q_map->registered_q_mask); for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { - if (!q_map->cpu_to_qid) - return; - - /* - * Position of this CPU within the registered queue mask, - * handles non-contiguous CPU distributions across NUMA nodes. 
- */ - qid_idx = bitmap_weight( - cpumask_bits(q_map->registered_q_mask), cpu); + if (node != -1 && cpu_to_node(cpu) != node) + continue; - q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx % nr_queues, + qid_idx = mapping_count % nr_queues; + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx, q_map->registered_q_mask); + mapping_count++; + pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n", + __func__, node, qid, qid_idx, nr_queues, cpu, + q_map->cpu_to_qid[cpu]); } } @@ -441,7 +440,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, /* Static mapping from cpu to per numa queues */ node = cpu_to_node(qid); - fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node]); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node], node); /* * smp_store_release, as the variable is read without fc->lock and @@ -452,7 +451,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, ring->numa_q_map[node].nr_queues + 1); /* global mapping */ - fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map, -1); spin_unlock(&fc->lock); From 9bf3ed0e828f968941faff74581cde55f223ebb4 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Fri, 26 Dec 2025 23:34:06 +0800 Subject: [PATCH 44/47] Fix the compiling error on aarch64 Fix the include sequence which causes a compiling error on aarch64. (imported from commit f5fed0e3f4ad6f98427baa53f5e7505df831dd81) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 17f9dac2377bbe..5e96dc72a63854 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -6,8 +6,8 @@ See the file COPYING. 
*/ -#include "fuse_dlm_cache.h" #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include From d9dcafe2ecd6c580181d0ccb415d99ea8bff23b9 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 11 Feb 2026 16:38:21 +0100 Subject: [PATCH 45/47] fuse: {io-uring} Prefer the current core over mapping Mapping might point to a totally different core due to random assignment. For performance using the current core might be beneficial Example (with core binding) unpatched WRITE: bw=841MiB/s patched WRITE: bw=1363MiB/s With fio --name=test --ioengine=psync --direct=1 \ --rw=write --bs=1M --iodepth=1 --numjobs=1 \ --filename_format=/redfs/testfile.\$jobnum --size=100G \ --thread --create_on_open=1 --runtime=30s --cpus_allowed=1 In order to get the good number `--cpus_allowed=1` is needed. This could be improved by a future change that avoids cpu migration in fuse_request_end() on wake_up() call. (imported from commit 32e0073d67cfc7bd602dc7675ae71fa825b04362) --- fs/fuse/dev_uring.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1522d5114562c7..97d7557a83554a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,8 +22,12 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +/* Threshold that determines if a better queue should be searched for */ #define FUSE_URING_Q_THRESHOLD 2 +/* Number of (re)tries to find a better queue */ +#define FUSE_URING_Q_TRIES 3 + /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -1501,7 +1505,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node, retries = 0; + int node, tries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); struct fuse_ring_queue *queue, *primary_queue = NULL; @@ -1526,26 +1530,36 @@ static struct fuse_ring_queue 
*fuse_uring_select_queue(struct fuse_ring *ring, nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); if (nr_queues) { + /* prefer the queue that corresponds to the current cpu */ + queue = READ_ONCE(ring->queues[cpu]); + if (queue) { + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + primary_queue = queue; + } + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - queue = READ_ONCE(ring->queues[qid]); + if (qid != cpu) { + queue = READ_ONCE(ring->queues[qid]); - /* Might happen on teardown */ - if (unlikely(!queue)) - return NULL; + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; - if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) - return queue; + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + } /* Retries help for load balancing */ - if (retries < FUSE_URING_Q_THRESHOLD) { - if (!retries) + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) { + if (!primary_queue) primary_queue = queue; - /* Increase cpu, assuming it will map to a differet qid*/ + /* Increase cpu, assuming it will map to a different qid*/ cpu++; - retries++; + tries++; goto retry; } } @@ -1556,9 +1570,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; - if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) - /* Might happen on teardown */ + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) { + /* Might happen on teardown */ return NULL; + } return READ_ONCE(ring->queues[qid]); } From 43ca84c3fc9d14ddac3308ccf07168cb59e6d5e0 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Sun, 11 Jan 2026 15:37:01 +0800 Subject: [PATCH 46/47] fuse: invalidate the page cache after direct write This fixes xfstests generic/451 (for both O_DIRECT and FOPEN_DIRECT_IO direct write). 
Commit b359af8275a9 ("fuse: Invalidate the page cache after FOPEN_DIRECT_IO write") tries to fix the similar issue for FOPEN_DIRECT_IO write, which can be reproduced by xfstests generic/209. It only fixes the issue for synchronous direct write, while omitting the case for asynchronous direct write (exactly targeted by generic/451). While for O_DIRECT direct write, it's somewhat more complicated. For synchronous direct write, generic_file_direct_write() will invalidate the page cache after the write, and thus it can pass generic/209. While for asynchronous direct write, the invalidation in generic_file_direct_write() is bypassed since the invalidation shall be done when the asynchronous IO completes. This is omitted in FUSE and generic/451 fails whereby. Fix this by conveying the invalidation for both synchronous and asynchronous write. - with FOPEN_DIRECT_IO - sync write, invalidate in fuse_send_write() - async write, invalidate in fuse_aio_complete() with FUSE_ASYNC_DIO, fuse_send_write() otherwise - without FOPEN_DIRECT_IO - sync write, invalidate in generic_file_direct_write() - async write, invalidate in fuse_aio_complete() with FUSE_ASYNC_DIO, generic_file_direct_write() otherwise Reviewed-by: Bernd Schubert Signed-off-by: Jingbo Xu (imported from commit f6de786cb23689f0ee235a2dc75a991d30a90198) --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 373dec9381b2fa..a5d229f97800ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -762,6 +762,18 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; + + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may 
have competed + * with the write. + */ + if (io->write && res && mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + io->offset >> PAGE_SHIFT, + (io->offset + res - 1) >> PAGE_SHIFT); + } spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); @@ -1185,9 +1197,11 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, { struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; + ssize_t written; ssize_t err; fuse_write_args_fill(ia, ff, pos, count); @@ -1201,10 +1215,26 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return fuse_async_req_send(fm, ia, count); err = fuse_simple_request(fm, &ia->ap.args); - if (!err && ia->write.out.size > count) + written = ia->write.out.size; + if (!err && written > count) err = -EIO; - return err ?: ia->write.out.size; + /* + * Without FOPEN_DIRECT_IO, generic_file_direct_write() does the + * invalidation for us. + */ + if (!err && written && mapping->nrpages && + (ff->open_flags & FOPEN_DIRECT_IO)) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + (pos + written - 1) >> PAGE_SHIFT); + } + + return err ?: written; } bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) @@ -1814,15 +1844,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; - if (res > 0 && write && fopen_direct_io) { - /* - * As in generic_file_direct_write(), invalidate after the - * write, to invalidate read-ahead cache that may have competed - * with the write. - */ - invalidate_inode_pages2_range(mapping, idx_from, idx_to); - } - return res > 0 ? 
res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); From 69ecae4997fac02407bb77a418a4544920c11666 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 23 Feb 2026 16:58:19 +0100 Subject: [PATCH 47/47] fuse: enable large folios in inode initialization Signed-off-by: Horst Birthelmer --- fs/fuse/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a5d229f97800ea..f86f7ad1815ff6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3373,4 +3373,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); + + mapping_set_large_folios(inode->i_mapping); }