From d435ef38bc38428aa217b67d470750a3e586e10d Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 5 Jul 2024 18:04:49 +0800 Subject: [PATCH 1/9] fuse: make foffset alignment opt-in for optimum backend performance Sometimes the file offset alignment needs to be opt-in to achieve the optimum performance at the backend store. For example when ErasureCode [1] is used at the backend store, the optimum write performance is achieved when the WRITE request is aligned with the stripe size of ErasureCode. Otherwise a non-aligned WRITE request needs to be split at the stripe size boundary. It is quite costly to handle these split partial requests, as firstly the whole stripe to which the split partial request belongs needs to be read out, then overwrite the read stripe buffer with the request, and finally write the whole stripe back to the persistent storage. Thus the backend store can suffer severe performance degradation when WRITE requests cannot fit into one stripe exactly. The write performance can be 10x slower when the request is 256KB in size given a 4MB stripe size. Also there can be 50% performance degradation in theory if the request is not stripe boundary aligned. Besides, the conducted test indicates that the non-alignment issue becomes more severe when decreasing fuse's max_ratio, maybe partly because the background writeback now is more likely to run in parallel with the dirtier. fuse's max_ratio ratio of aligned WRITE requests ---------------- ------------------------------- 70 99.9% 40 74% 20 45% 10 20% With the patched version, which makes the alignment constraint opt-in when constructing WRITE requests, the ratio of aligned WRITE requests increases to 98% (previously 20%) when fuse's max_ratio is 10. 
fuse: fix alignment to work with redfs ubuntu - small fix to make the fuse alignment patch work with redfs ubuntu 6.8.x - add writeback_control to fuse_writepage_need_send() to make more accurate decisions about when to skip sending data - fix shift number for FUSE_ALIGN_PG_ORDER - remove test code [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4 Signed-off-by: Jingbo Xu Signed-off-by: Bernd Schubert Signed-off-by: Horst Birthelmer (cherry picked from commit 5e590a657460229e5cc8b05c5477a47955c96885) --- fs/fuse/file.c | 18 +++++++++++++++--- fs/fuse/fuse_i.h | 4 ++++ fs/fuse/inode.c | 8 ++++++++ include/uapi/linux/fuse.h | 10 +++++++++- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d3ddc5dd0d61fb..1a0c45c752f49b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2099,8 +2099,9 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, - struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data, + struct writeback_control *wbc) { WARN_ON(!ap->num_pages); @@ -2120,6 +2121,17 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) return true; + /* Reached alignment */ + if (fc->alignment_pages && !(page->index % fc->alignment_pages)) { + /* we are at a point where we would write aligned + * check if we potentially could reach the next alignment */ + if (page->index + fc->alignment_pages > wbc->range_end) + return true; + + if (ap->num_pages + fc->alignment_pages > fc->max_pages) + return true; + } + return false; } @@ -2141,7 +2153,7 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if 
(wpa && fuse_writepage_need_send(fc, &folio->page, ap, data, wbc)) { fuse_writepages_send(data); data->wpa = NULL; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 436e2c05a7a9ad..71d51ee8fc961a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -931,6 +931,10 @@ struct fuse_conn { /** uring connection information*/ struct fuse_ring *ring; #endif + + /* The foffset alignment in PAGE */ + unsigned int alignment_pages; + }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0032cc94ed3aab..b59a5a325657cc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1429,6 +1429,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->direct_io_allow_mmap = 1; if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) fc->io_uring = 1; + + if (flags & FUSE_ALIGN_PG_ORDER) { + if (arg->align_page_order > 0) { + fc->alignment_pages = + (1UL << arg->align_page_order) + >> PAGE_SHIFT; + } + } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; if (flags & FUSE_INVAL_INODE_ENTRY) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 96a9fc4c395518..52ac7a3d266d46 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -429,6 +429,8 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for + * optimal io-size alignment * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free * to register between 1 and nr-core io-uring queues */ @@ -903,6 +905,9 @@ struct fuse_init_in { #define FUSE_COMPAT_INIT_OUT_SIZE 8 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24 +/* + * align_page_order: Number of pages for optimal IO, or a multiple of that + */ struct fuse_init_out { uint32_t major; uint32_t 
minor; @@ -915,7 +920,10 @@ struct fuse_init_out { uint16_t max_pages; uint16_t map_alignment; uint32_t flags2; - uint32_t unused[7]; + uint32_t max_stack_depth; + uint8_t align_page_order; + uint8_t padding[3]; + uint32_t unused[5]; }; #define CUSE_INIT_INFO_MAX 4096 From 26bf669973f5d736d296f2565e51d1926e0590b6 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Thu, 4 Sep 2025 12:56:31 +0200 Subject: [PATCH 2/9] Revert "fuse: avoid tmp copying of data for writeback pages" This reverts commit 114c4df06d489bd0fc3bbc318073da76597401c4. (cherry picked from commit 461c4ed764169f0a9dbd314e29f1a5c690615443) --- fs/fuse/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++---- fs/fuse/fuse_i.h | 3 + 2 files changed, 331 insertions(+), 23 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a0c45c752f49b..e58d74d319d8aa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -529,20 +529,83 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; + struct rb_node writepages_entry; struct list_head queue_entry; + struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, + pgoff_t idx_from, pgoff_t idx_to) +{ + struct rb_node *n; + + n = fi->writepages.rb_node; + + while (n) { + struct fuse_writepage_args *wpa; + pgoff_t curr_index; + + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else + return wpa; + } + return NULL; +} + +/* + * Check if any page in a range is under writeback + */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + bool found; + + if 
(RB_EMPTY_ROOT(&fi->writepages)) + return false; + + spin_lock(&fi->lock); + found = fuse_find_writeback(fi, idx_from, idx_to); + spin_unlock(&fi->lock); + + return found; +} + +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { - struct page *page = find_get_page(inode->i_mapping, index); - if (page) { - wait_on_page_writeback(page); - put_page(page); - } + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ static void fuse_sync_writes(struct inode *inode) { fuse_set_nowrite(inode); @@ -568,6 +631,10 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; + inode_lock(inode); + fuse_sync_writes(inode); + inode_unlock(inode); + err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -910,6 +977,13 @@ static int fuse_do_readpage(struct file *file, struct page *page) ssize_t res; u64 attr_ver; + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1609,7 +1683,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && filemap_range_has_writeback(mapping, pos, pos + count - 1)) { + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1782,10 +1856,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; + int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); + if (wpa->ia.ff) fuse_file_put(wpa->ia.ff, false); @@ -1802,12 +1880,11 @@ static void fuse_writepage_finish(struct fuse_mount *fm, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - end_page_writeback(ap->pages[i]); - wb_writeout_inc(&bdi->wb); - } - + for (i = 0; i < ap->num_pages; i++) { + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1817,6 +1894,7 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { + struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1852,8 +1930,23 @@ __acquires(fi->lock) out_free: fi->writectr--; + rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); + + /* After rb_erase() aux request list is private */ + for (aux = wpa->next; aux; aux = next) { + struct backing_dev_info *bdi = inode_to_bdi(aux->inode); + + next = aux->next; + aux->next = 
NULL; + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); + fuse_writepage_free(aux); + } + fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1881,6 +1974,43 @@ __acquires(fi->lock) } } +static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, + struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return curr; + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); + return NULL; +} + +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + WARN_ON(fuse_insert_writeback(root, wpa)); +} + static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -1900,6 +2030,42 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); + rb_erase(&wpa->writepages_entry, &fi->writepages); + while (wpa->next) { + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); + tree_insert(&fi->writepages, next); + + /* + * Skip fuse_flush_writepages() to make it easy to crop 
requests + * based on primary request size. + * + * 1st case (trivial): there are no concurrent activities using + * fuse_set/release_nowrite. Then we're on safe side because + * fuse_flush_writepages() would call fuse_send_writepage() + * anyway. + * + * 2nd case: someone called fuse_set_nowrite and it is waiting + * now for completion of all in-flight requests. This happens + * rarely and no more than once per page, so this should be + * okay. + * + * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle + * of fuse_set_nowrite..fuse_release_nowrite section. The fact + * that fuse_set_nowrite returned implies that all in-flight + * requests were completed along with all of their secondary + * requests. Further primary requests are blocked by negative + * writectr. Hence there cannot be any in-flight requests and + * no invocations of fuse_writepage_end() while we're in + * fuse_set_nowrite..fuse_release_nowrite section. + */ + fuse_send_writepage(fm, next, inarg->offset + inarg->size); + } fi->writectr--; fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); @@ -1993,7 +2159,8 @@ static int fuse_writepage_locked(struct page *page) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - int error = -EIO; + struct page *tmp_page; + int error = -ENOMEM; set_page_writeback(page); @@ -2002,32 +2169,44 @@ static int fuse_writepage_locked(struct page *page) goto err; ap = &wpa->ia.ap; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + error = -EIO; wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) - goto err_free; + goto err_nofile; fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + copy_highpage(tmp_page, page); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - + wpa->next = NULL; ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = page; + ap->pages[0] = tmp_page; ap->descs[0].offset = 0; 
ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); + end_page_writeback(page); + return 0; +err_nofile: + __free_page(tmp_page); err_free: kfree(wpa); err: @@ -2041,6 +2220,19 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; + if (fuse_page_is_writeback(page->mapping->host, page->index)) { + /* + * ->writepages() should be called for sync() and friends. We + * should only get here on direct reclaim and then we are + * allowed to skip a page which is already in flight + */ + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) return AOP_WRITEPAGE_ACTIVATE; @@ -2055,6 +2247,7 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; + struct page **orig_pages; unsigned int max_pages; }; @@ -2089,14 +2282,74 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); + int num_pages = wpa->ia.ap.num_pages; + int i; wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); + + for (i = 0; i < num_pages; i++) + end_page_writeback(data->orig_pages[i]); } +/* + * Check under fi->lock if the page is under writeback, and insert it onto the + * rb_tree if not. 
Otherwise iterate auxiliary write requests, to see if there's + * one already added for a page at this offset. If there's none, then insert + * this new request onto the auxiliary list, otherwise reuse the existing one by + * swapping the new temp page with the old one. + */ +static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, + struct page *page) +{ + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; + + WARN_ON(new_ap->num_pages != 0); + new_ap->num_pages = 1; + + spin_lock(&fi->lock); + old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); + if (!old_wpa) { + spin_unlock(&fi->lock); + return true; + } + + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { + pgoff_t curr_index; + + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; + if (curr_index == page->index) { + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + break; + } + } + + if (!tmp) { + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; + } + + spin_unlock(&fi->lock); + + if (tmp) { + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); + fuse_writepage_free(new_wpa); + } + + return false; +} static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, struct fuse_args_pages *ap, @@ -2105,6 +2358,15 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, { WARN_ON(!ap->num_pages); + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. 
+ */ + if (fuse_page_is_writeback(data->inode, page->index)) + return true; + /* Reached max pages */ if (ap->num_pages == fc->max_pages) return true; @@ -2114,7 +2376,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return true; /* Discontinuity */ - if (ap->pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2144,6 +2406,7 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); + struct page *tmp_page; int err; if (!data->ff) { @@ -2158,11 +2421,31 @@ static int fuse_writepages_fill(struct folio *folio, data->wpa = NULL; } + err = -ENOMEM; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto out_unlock; + + /* + * The page must not be redirtied until the writeout is completed + * (i.e. userspace has sent a reply to the write request). Otherwise + * there could be more than one temporary page instance for each real + * page. + * + * This is ensured by holding the page lock in page_mkwrite() while + * checking fuse_page_is_writeback(). We already hold the page lock + * since clear_page_dirty_for_io() and keep it held until we add the + * request to the fi->writepages list and increment ap->num_pages. + * After this fuse_page_is_writeback() will indicate that the page is + * under writeback, so we can release the page lock. 
+ */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_alloc(); - if (!wpa) + if (!wpa) { + __free_page(tmp_page); goto out_unlock; + } fuse_writepage_add_to_bucket(fc, wpa); data->max_pages = 1; @@ -2170,23 +2453,36 @@ static int fuse_writepages_fill(struct folio *folio, ap = &wpa->ia.ap; fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->inode = inode; - wpa->ia.ff = data->ff; + wpa->next = NULL; ap->args.in_pages = true; ap->args.end = fuse_writepage_end; ap->num_pages = 0; + wpa->inode = inode; } folio_start_writeback(folio); + + copy_highpage(tmp_page, &folio->page); + ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->pages[ap->num_pages] = &folio->page; - ap->num_pages++; + data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (!data->wpa) { + if (data->wpa) { + /* + * Protected by fi->lock against concurrent access by + * fuse_page_is_writeback(). 
+ */ + spin_lock(&fi->lock); + ap->num_pages++; + spin_unlock(&fi->lock); + } else if (fuse_writepage_add(wpa, &folio->page)) { data->wpa = wpa; + } else { + folio_end_writeback(folio); } out_unlock: folio_unlock(folio); @@ -2214,6 +2510,13 @@ static int fuse_writepages(struct address_space *mapping, data.wpa = NULL; data.ff = NULL; + err = -ENOMEM; + data.orig_pages = kcalloc(fc->max_pages, + sizeof(struct page *), + GFP_NOFS); + if (!data.orig_pages) + goto out; + err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { WARN_ON(!data.wpa->ia.ap.num_pages); @@ -2222,6 +2525,7 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); + kfree(data.orig_pages); out: return err; } @@ -3244,6 +3548,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); + fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 71d51ee8fc961a..8d6cedcaa18a0d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -152,6 +152,9 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ + struct rb_root writepages; + /* dlm locked areas we have sent lock requests for */ struct fuse_dlm_cache dlm_locked_areas; }; From d169da38f6f6385c2366da980d8287365beb354e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Feb 2024 18:29:37 +0000 Subject: [PATCH 3/9] fuse: Remove fuse_writepage The writepage operation is deprecated as it leads to worse performance under high memory pressure due to folios being written out in LRU order rather than sequentially within a file. Use filemap_migrate_folio() to support dirty folio migration instead of writepage. 
Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Miklos Szeredi (cherry picked from commit ade0d22c97fd8bfd30aedaad5445d1d8b2f38eba) --- fs/fuse/file.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e58d74d319d8aa..18615dccd20a57 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2215,34 +2215,6 @@ static int fuse_writepage_locked(struct page *page) return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; @@ -3524,10 +3496,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, From 4707cdb633a961ed3275b09bcf804ab9c737bc9d Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 7 Jul 2025 14:28:41 +0200 Subject: [PATCH 4/9] fuse: avoid tmp copying of data for writeback pages When writing back pages while using writeback 
caching the code did a copy of data into temporary pages to avoid a deadlock in reclaiming of memory. This is an adaptation and backport of a patch by Joanne Koong joannelkoong@gmail.com. Since we use pinned memory with io_uring we don't need the temporary copies and we don't use the AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM flag in the pagemap. Link: https://www.spinics.net/lists/linux-mm/msg407405.html Signed-off-by: Horst Birthelmer (cherry picked from commit f18c61e762d6f51ab95c3def560854155ccee04e) --- fs/fuse/file.c | 350 +++++------------------------------------------ fs/fuse/fuse_i.h | 9 +- 2 files changed, 40 insertions(+), 319 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 18615dccd20a57..aa928d09044ea1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -529,83 +529,20 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, 
idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + struct page *page = find_get_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + put_page(page); + } } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ static void fuse_sync_writes(struct inode *inode) { fuse_set_nowrite(inode); @@ -631,10 +568,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -977,13 +910,6 @@ static int fuse_do_readpage(struct file *file, struct page *page) ssize_t res; u64 attr_ver; - /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. 
- */ - fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1683,7 +1609,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, pos + count - 1)) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1856,14 +1782,10 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); - if (wpa->ia.ff) fuse_file_put(wpa->ia.ff, false); @@ -1880,11 +1802,12 @@ static void fuse_writepage_finish(struct fuse_mount *fm, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_pages; i++) { + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + end_page_writeback(ap->pages[i]); + wb_writeout_inc(&bdi->wb); + } + wake_up(&fi->page_waitq); } @@ -1894,7 +1817,6 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1930,23 +1852,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - struct backing_dev_info *bdi = inode_to_bdi(aux->inode); - - next = aux->next; - aux->next = 
NULL; - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1974,43 +1881,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_pages); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_pages) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2030,42 +1900,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop 
requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); @@ -2159,8 +1993,7 @@ static int fuse_writepage_locked(struct page *page) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; - int error = -ENOMEM; + int error = -EIO; set_page_writeback(page); @@ -2169,44 +2002,32 @@ static int fuse_writepage_locked(struct page *page) goto err; ap = &wpa->ia.ap; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto err_free; - - error = -EIO; wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) - goto err_nofile; + goto err_free; fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); - copy_highpage(tmp_page, page); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; + ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = tmp_page; + ap->pages[0] = page; ap->descs[0].offset = 0; 
ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); - return 0; -err_nofile: - __free_page(tmp_page); err_free: kfree(wpa); err: @@ -2219,7 +2040,6 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; unsigned int max_pages; }; @@ -2254,74 +2074,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; - int i; wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); } -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. 
- */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - fuse_writepage_free(new_wpa); - } - - return false; -} static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, struct fuse_args_pages *ap, @@ -2330,15 +2090,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, { WARN_ON(!ap->num_pages); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. 
- */ - if (fuse_page_is_writeback(data->inode, page->index)) - return true; - /* Reached max pages */ if (ap->num_pages == fc->max_pages) return true; @@ -2348,7 +2099,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (ap->pages[ap->num_pages - 1]->index + 1 != page->index) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2378,7 +2129,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; int err; if (!data->ff) { @@ -2393,31 +2143,11 @@ static int fuse_writepages_fill(struct folio *folio, data->wpa = NULL; } - err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. 
- */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_alloc(); - if (!wpa) { - __free_page(tmp_page); + if (!wpa) goto out_unlock; - } fuse_writepage_add_to_bucket(fc, wpa); data->max_pages = 1; @@ -2425,36 +2155,23 @@ static int fuse_writepages_fill(struct folio *folio, ap = &wpa->ia.ap; fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; + wpa->inode = inode; + wpa->ia.ff = data->ff; ap->args.in_pages = true; ap->args.end = fuse_writepage_end; ap->num_pages = 0; - wpa->inode = inode; } folio_start_writeback(folio); - - copy_highpage(tmp_page, &folio->page); - ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = &folio->page; + ap->pages[ap->num_pages] = &folio->page; + ap->num_pages++; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). 
- */ - spin_lock(&fi->lock); - ap->num_pages++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + if (!data->wpa) { data->wpa = wpa; - } else { - folio_end_writeback(folio); } out_unlock: folio_unlock(folio); @@ -2482,13 +2199,6 @@ static int fuse_writepages(struct address_space *mapping, data.wpa = NULL; data.ff = NULL; - err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) - goto out; - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { WARN_ON(!data.wpa->ia.ap.num_pages); @@ -2497,7 +2207,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_pages); out: return err; } @@ -3472,6 +3181,16 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, return ret; } +#ifdef CONFIG_MIGRATION +int fuse_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { + + if (folio_test_writeback(src)) + return -EBUSY; + return filemap_migrate_folio(mapping, dst, src, mode); +} +#endif + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, @@ -3499,7 +3218,7 @@ static const struct address_space_operations fuse_file_aops = { .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, - .migrate_folio = filemap_migrate_folio, + .migrate_folio = fuse_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, @@ -3520,7 +3239,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 
8d6cedcaa18a0d..6fcc7cc03dae4d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -152,9 +152,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; - /* dlm locked areas we have sent lock requests for */ struct fuse_dlm_cache dlm_locked_areas; }; @@ -1480,6 +1477,12 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +#ifdef CONFIG_MIGRATION +int fuse_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode); +#else +#define fuse_migrate_folio NULL +#endif #ifdef CONFIG_MIGRATION int fuse_migrate_folio(struct address_space *mapping, struct folio *dst, From 7d4316423ecab7a9e74353806b66befcc85cc2ff Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 24 Sep 2025 19:14:19 +0200 Subject: [PATCH 5/9] fuse: {io-uring} Queue background requests on a different core Running background IO on a different core makes quite a difference. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=272MiB/s (285MB/s) ... patched READ: bw=650MiB/s (682MB/s) Reason is easily visible, the fio process is migrating between CPUs when requests are submitted on the queue for the same core. 
With --iodepth=8 unpatched READ: bw=466MiB/s (489MB/s) patched READ: bw=641MiB/s (672MB/s) Without io-uring (--iodepth=8) READ: bw=729MiB/s (764MB/s) Without fuse (--iodepth=8) READ: bw=2199MiB/s (2306MB/s) (Test were done with /example/passthrough_hp -o allow_other --nopassthrough \ [-o io_uring] /tmp/source /tmp/dest ) Additional notes: With FURING_NEXT_QUEUE_RETRIES=0 (--iodepth=8) READ: bw=903MiB/s (946MB/s) With just a random qid (--iodepth=8) READ: bw=429MiB/s (450MB/s) With --iodepth=1 unpatched READ: bw=195MiB/s (204MB/s) patched READ: bw=232MiB/s (243MB/s) With --iodepth=1 --numjobs=2 unpatched READ: bw=366MiB/s (384MB/s) patched READ: bw=472MiB/s (495MB/s) With --iodepth=1 --numjobs=8 unpatched READ: bw=1437MiB/s (1507MB/s) patched READ: bw=1529MiB/s (1603MB/s) fuse without io-uring READ: bw=1314MiB/s (1378MB/s), 1314MiB/s-1314MiB/s ... no-fuse READ: bw=2566MiB/s (2690MB/s), 2566MiB/s-2566MiB/s ... In summary, for async requests the core doing application IO is busy sending requests and processing IOs should be done on a different core. Spreading the load on random cores is also not desirable, as the core might be frequency scaled down and/or in C1 sleep states. Not shown here, but differnces are much smaller when the system uses performance govenor instead of schedutil (ubuntu default). Obviously at the cost of higher system power consumption for performance govenor - not desirable either. Results without io-uring (which uses fixed libfuse threads per queue) heavily depend on the current number of active threads. Libfuse uses default of max 10 threads, but actual nr max threads is a parameter. Also, no-fuse-io-uring results heavily depend on, if there was already running another workload before, as libfuse starts these threads dynamically - i.e. the more threads are active, the worse the performance. 
Signed-off-by: Bernd Schubert (cherry picked from commit c6399ea79b104ac79758f2c36f1977b80a02358d) --- fs/fuse/dev_uring.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index bf5e30fd7a0307..d7af39ceb322d3 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1456,13 +1456,21 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; int node; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy. + */ + if (background) + cpu++; + cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1525,7 +1533,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; @@ -1567,7 +1575,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; From 3909d1434a73bd9312140804e18f47863037a66f Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 24 Oct 2025 19:05:07 +0200 Subject: [PATCH 6/9] fuse: Add retry attempts for numa local queues for load distribution This is to further improve performance. 
fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=650MiB/s (682MB/s) patched: READ: bw=995MiB/s (1043MB/s) with --iodepth=8 unpatched READ: bw=641MiB/s (672MB/s) patched READ: bw=966MiB/s (1012MB/s) Reason is that with --iodepth=x (x > 1) fio submits multiple async requests and a single queue might become CPU limited. I.e. spreading the load helps. (cherry picked from commit 2e73b0be1f55d61c2d861a12bf6bb9963b9b877a) --- fs/fuse/dev_uring.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index d7af39ceb322d3..1bd12570c583f0 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,6 +22,8 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +#define FUSE_URING_Q_THRESHOLD 2 + /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -1460,9 +1462,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node; + int node, retries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; /* * Background requests result in better performance on a different @@ -1471,6 +1474,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, if (background) cpu++; +retry: cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1486,12 +1490,35 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - return READ_ONCE(ring->queues[qid]); + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if 
(queue->nr_reqs < FUSE_URING_Q_THRESHOLD) + return queue; + + /* Retries help for load balancing */ + if (retries < FUSE_URING_Q_THRESHOLD) { + if (!retries) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a differet qid*/ + cpu++; + retries++; + goto retry; + } } + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + /* Might happen on teardown */ return NULL; return READ_ONCE(ring->queues[qid]); } From 53047892432d10222a8e18dd563e42b6f8238c85 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Thu, 8 Jan 2026 20:22:06 +0100 Subject: [PATCH 7/9] fuse: simplify compound commands Simplify fuse_compound_req to hold only the pointers to the added fuse args and the request housekeeping. Simplify open+getattr call by using helper functions to fill out the fuse request parameters Signed-off-by: Horst Birthelmer (cherry picked from commit 1607a03696693c4ceef7a61adf5759748a7ca9b0) Note: Empty, as there seem to be compound differences between branches - the el9.6 branch already has the changes from this patch. Keeping it will just simplify branch comparison with https://github.com/bsbernd/compare-git-branches From d91d055e070b8e10b9b3822e24e8aa9bee85d8fe Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Sun, 11 Jan 2026 15:37:01 +0800 Subject: [PATCH 8/9] fuse: invalidate the page cache after direct write This fixes xfstests generic/451 (for both O_DIRECT and FOPEN_DIRECT_IO direct write). Commit b359af8275a9 ("fuse: Invalidate the page cache after FOPEN_DIRECT_IO write") tries to fix the similar issue for FOPEN_DIRECT_IO write, which can be reproduced by xfstests generic/209. It only fixes the issue for synchronous direct write, while omitting the case for asynchronous direct write (exactly targeted by generic/451). While for O_DIRECT direct write, it's somewhat more complicated. 
For synchronous direct write, generic_file_direct_write() will invalidate the page cache after the write, and thus it can pass generic/209. While for asynchronous direct write, the invalidation in generic_file_direct_write() is bypassed since the invalidation shall be done when the asynchronous IO completes. This is omitted in FUSE and generic/451 fails whereby. Fix this by conveying the invalidation for both synchronous and asynchronous write. - with FOPEN_DIRECT_IO - sync write, invalidate in fuse_send_write() - async write, invalidate in fuse_aio_complete() with FUSE_ASYNC_DIO, fuse_send_write() otherwise - without FOPEN_DIRECT_IO - sync write, invalidate in generic_file_direct_write() - async write, invalidate in fuse_aio_complete() with FUSE_ASYNC_DIO, generic_file_direct_write() otherwise Reviewed-by: Bernd Schubert Signed-off-by: Jingbo Xu (cherry picked from commit f6de786cb23689f0ee235a2dc75a991d30a90198) --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa928d09044ea1..9ce6e8328ecb1c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -758,6 +758,18 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; + + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. 
+ */ + if (io->write && res && mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + io->offset >> PAGE_SHIFT, + (io->offset + res - 1) >> PAGE_SHIFT); + } spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); @@ -1132,9 +1144,11 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, { struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; + ssize_t written; ssize_t err; fuse_write_args_fill(ia, ff, pos, count); @@ -1148,10 +1162,26 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return fuse_async_req_send(fm, ia, count); err = fuse_simple_request(fm, &ia->ap.args); - if (!err && ia->write.out.size > count) + written = ia->write.out.size; + if (!err && written > count) err = -EIO; - return err ?: ia->write.out.size; + /* + * Without FOPEN_DIRECT_IO, generic_file_direct_write() does the + * invalidation for us. + */ + if (!err && written && mapping->nrpages && + (ff->open_flags & FOPEN_DIRECT_IO)) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + (pos + written - 1) >> PAGE_SHIFT); + } + + return err ?: written; } bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) @@ -1676,15 +1706,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; - if (res > 0 && write && fopen_direct_io) { - /* - * As in generic_file_direct_write(), invalidate after the - * write, to invalidate read-ahead cache that may have competed - * with the write. - */ - invalidate_inode_pages2_range(mapping, idx_from, idx_to); - } - return res > 0 ? 
res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); From dc9fe285a05d8ebfb1caab87ec327c2b4278da92 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 11 Feb 2026 16:38:21 +0100 Subject: [PATCH 9/9] fuse: {io-uring} Prefer the current core over mapping Mapping might point to a totally different core due to random assignment. For performance using the current core might be beneficial Example (with core binding) unpatched WRITE: bw=841MiB/s patched WRITE: bw=1363MiB/s With fio --name=test --ioengine=psync --direct=1 \ --rw=write --bs=1M --iodepth=1 --numjobs=1 \ --filename_format=/redfs/testfile.\$jobnum --size=100G \ --thread --create_on_open=1 --runtime=30s --cpus_allowed=1 In order to get the good number `--cpus_allowed=1` is needed. This could be improved by a future change that avoids cpu migration in fuse_request_end() on wake_up() call. (cherry picked from commit 32e0073d67cfc7bd602dc7675ae71fa825b04362) --- fs/fuse/dev_uring.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1bd12570c583f0..ec91a33627d7f8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,8 +22,12 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +/* Threshold that determines if a better queue should be searched for */ #define FUSE_URING_Q_THRESHOLD 2 +/* Number of (re)tries to find a better queue */ +#define FUSE_URING_Q_TRIES 3 + /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -1462,7 +1466,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node, retries = 0; + int node, tries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); struct fuse_ring_queue *queue, *primary_queue = NULL; @@ -1487,26 +1491,36 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, nr_queues = 
READ_ONCE(ring->numa_q_map[node].nr_queues); if (nr_queues) { + /* prefer the queue that corresponds to the current cpu */ + queue = READ_ONCE(ring->queues[cpu]); + if (queue) { + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + primary_queue = queue; + } + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - queue = READ_ONCE(ring->queues[qid]); + if (qid != cpu) { + queue = READ_ONCE(ring->queues[qid]); - /* Might happen on teardown */ - if (unlikely(!queue)) - return NULL; + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; - if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) - return queue; + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + } /* Retries help for load balancing */ - if (retries < FUSE_URING_Q_THRESHOLD) { - if (!retries) + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) { + if (!primary_queue) primary_queue = queue; - /* Increase cpu, assuming it will map to a differet qid*/ + /* Increase cpu, assuming it will map to a different qid*/ cpu++; - retries++; + tries++; goto retry; } } @@ -1517,9 +1531,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; - if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) - /* Might happen on teardown */ + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) { + /* Might happen on teardown */ return NULL; + } return READ_ONCE(ring->queues[qid]); }