diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml new file mode 100644 index 00000000000000..cc03d7e1219e9b --- /dev/null +++ b/.github/workflows/create-redfs-pr.yml @@ -0,0 +1,93 @@ +# Automatially run copy-from-linux-branch.sh on branches and create PR for redfs. +name: Sync to redfs repo +on: + # Triggers the workflow on pull request merged. + pull_request: + branches: [ "redfs-*" ] + types: [ "closed" ] + +jobs: + create-redfs-pr: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + # Checks-out to a different directory to avoid following checkout removing it. + - uses: actions/checkout@v4 + with: + path: linux + + - name: Try to checkout sync-${{ github.ref_name }} if it exists + uses: actions/checkout@v4 + id: try-checkout + continue-on-error: true + with: + repository: DDNStorage/redfs + ref: sync-${{ github.ref_name }} + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Fallback to checkout main + if: steps.try-checkout.outcome == 'failure' + uses: actions/checkout@v4 + with: + repository: DDNStorage/redfs + ref: main + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Initialize git + run: | + git config --global user.name "DDNStorage RED Workflow" + git config --global user.email "red@ddn.com" + + - name: Create tracking branch based on main + if: steps.try-checkout.outcome == 'failure' + run: | + pushd redfs + git checkout -b sync-${{ github.ref_name }} + popd + + - name: Generate PR for redfs + run: | + declare -A MAP + MAP["redfs-rhel9_4-427.42.1"]="5.14.0-427.42.1.el9_4" + MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.12.1.el9_6" + MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" + kerver=${MAP["${{ github.ref_name }}"]} + if [ -z ${kerver} ]; then + echo "Cannot find target kernel version" + exit 1 + fi + pushd redfs + ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} + git add src/$kerver + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }}\n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg + echo -e "${{ github.sha }}" >> ../commit.msg + RET=0 + git commit -F ../commit.msg 2> ../commit.log || RET=$?; + if [ -s ../commit.log ]; then + echo "Error detcted in commit:" + cat ../commit.log + exit 1 + elif [ $RET -eq 0 ]; then + echo "Done. Push the code to remote:" + git push origin sync-${{ github.ref_name }} 2> ../push.log ||: + else + echo "No changes to existed codes. Still try with PR." + fi + if [ -s ../push.log ]; then + echo "Message detected in push:" + cat ../push.log + fi + gh pr create --base main --fill || RET=$? + if [ $RET -eq 1 ]; then + echo "No pending changes for PR, returning $RET." + fi + popd + env: + GH_TOKEN: ${{ secrets.REDFS_TOKEN }} + diff --git a/debian/scripts/misc/kconfig/__init__.py b/debian/scripts/misc/kconfig/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3f0f312a31c1cc..f54c504ca6637c 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c new file mode 100644 index 00000000000000..5d84e3558a06f8 --- /dev/null +++ b/fs/fuse/compound.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2025 + * + * This file implements compound operations for FUSE, allowing multiple + * operations to be batched into a single request to reduce round trips + * between kernel and userspace. + */ + +#include "fuse_i.h" + +/* + * Compound request builder and state tracker and args pointer storage + */ +struct fuse_compound_req { + struct fuse_mount *fm; + struct fuse_compound_in compound_header; + struct fuse_compound_out result_header; + + /* Per-operation error codes */ + int op_errors[FUSE_MAX_COMPOUND_OPS]; + struct fuse_args *op_args[FUSE_MAX_COMPOUND_OPS]; +}; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, u32 flags) +{ + struct fuse_compound_req *compound; + + compound = kzalloc(sizeof(*compound), GFP_KERNEL); + if (!compound) + return ERR_PTR(-ENOMEM); + + compound->fm = fm; + compound->compound_header.flags = flags; + + return compound; +} + +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args) +{ + if (!compound || + compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) + return -EINVAL; + + if (args->in_pages) + return -EINVAL; + + compound->op_args[compound->compound_header.count] = args; + compound->compound_header.count++; + return 0; +} + +static void *fuse_copy_response_per_req(struct fuse_args *args, + char *resp) +{ + int i; + size_t copied = 0; + + for (i = 0; i < args->out_numargs; i++) { + struct fuse_arg current_arg = args->out_args[i]; + size_t arg_size = current_arg.size; + + if (current_arg.value && arg_size > 0) { + memcpy(current_arg.value, + (char *)resp + copied, arg_size); + copied += arg_size; + } + } + + return (char *)resp + copied; +} + +int fuse_compound_get_error(struct fuse_compound_req *compound, int op_idx) +{ + return compound->op_errors[op_idx]; +} + +static void *fuse_compound_parse_one_op(struct fuse_compound_req *compound, + int op_index, void *op_out_data, + void *response_end) +{ + struct fuse_out_header *op_hdr = op_out_data; + struct fuse_args *args = compound->op_args[op_index]; + + if (op_hdr->len < sizeof(struct fuse_out_header)) + return NULL; + + /* Check if the entire operation response fits in the buffer */ + if ((char *)op_out_data + op_hdr->len > (char *)response_end) + return NULL; + + if (op_hdr->error != 0) + compound->op_errors[op_index] = op_hdr->error; + + if (args && op_hdr->len > sizeof(struct fuse_out_header)) + return fuse_copy_response_per_req(args, op_out_data + + sizeof(struct fuse_out_header)); + + /* No response data, just advance past the header */ + return (char *)op_out_data + op_hdr->len; +} + +static int fuse_compound_parse_resp(struct fuse_compound_req *compound, + u32 count, void *response, + size_t response_size) +{ + void *op_out_data = response; + void *response_end = (char *)response + response_size; + int i; + + if (!response || response_size < sizeof(struct fuse_out_header)) + return -EIO; + + for (i = 0; i < count && i < compound->result_header.count; i++) { + op_out_data = fuse_compound_parse_one_op(compound, i, + op_out_data, + response_end); + if (!op_out_data) + return -EIO; + } + + return 0; +} + +ssize_t fuse_compound_send(struct fuse_compound_req *compound) +{ + struct fuse_args args = { + .opcode = FUSE_COMPOUND, + .nodeid = 0, + .in_numargs = 2, + .out_numargs = 2, + .out_argvar = true, + }; + size_t resp_buffer_size; + size_t actual_response_size; + size_t buffer_pos; + size_t total_expected_out_size; + void *buffer = NULL; + void *resp_payload; + ssize_t ret; + int i; + + if (!compound) { + pr_info_ratelimited("FUSE: compound request is NULL in %s\n", + __func__); + return -EINVAL; + } + + if (compound->compound_header.count == 0) { + pr_info_ratelimited("FUSE: compound request contains no operations\n"); + return -EINVAL; + } + + buffer_pos = 0; + total_expected_out_size = 0; + + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + buffer_pos += needed_size; + + for (j = 0; j < op_args->out_numargs; j++) + total_expected_out_size += op_args->out_args[j].size; + } + + buffer = kvmalloc(buffer_pos, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer_pos = 0; + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + struct fuse_in_header *hdr; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + hdr = (struct fuse_in_header *)(buffer + buffer_pos); + memset(hdr, 0, sizeof(*hdr)); + hdr->len = needed_size; + hdr->opcode = op_args->opcode; + hdr->nodeid = op_args->nodeid; + hdr->uid = from_kuid(compound->fm->fc->user_ns, + current_fsuid()); + hdr->gid = from_kgid(compound->fm->fc->user_ns, + current_fsgid()); + hdr->pid = pid_nr_ns(task_pid(current), + compound->fm->fc->pid_ns); + buffer_pos += sizeof(*hdr); + + for (j = 0; j < op_args->in_numargs; j++) { + memcpy(buffer + buffer_pos, op_args->in_args[j].value, + op_args->in_args[j].size); + buffer_pos += op_args->in_args[j].size; + } + } + + resp_buffer_size = total_expected_out_size + + (compound->compound_header.count * + sizeof(struct fuse_out_header)); + + resp_payload = kvmalloc(resp_buffer_size, GFP_KERNEL | __GFP_ZERO); + if (!resp_payload) { + ret = -ENOMEM; + goto out_free_buffer; + } + + compound->compound_header.result_size = total_expected_out_size; + + args.in_args[0].size = sizeof(compound->compound_header); + args.in_args[0].value = &compound->compound_header; + args.in_args[1].size = buffer_pos; + args.in_args[1].value = buffer; + + args.out_args[0].size = sizeof(compound->result_header); + args.out_args[0].value = &compound->result_header; + args.out_args[1].size = resp_buffer_size; + args.out_args[1].value = resp_payload; + + ret = fuse_simple_request(compound->fm, &args); + if (ret < 0) + goto out; + + actual_response_size = args.out_args[1].size; + + if (actual_response_size < sizeof(struct fuse_compound_out)) { + pr_info_ratelimited("FUSE: compound response too small (%zu bytes, minimum %zu bytes)\n", + actual_response_size, + sizeof(struct fuse_compound_out)); + ret = -EINVAL; + goto out; + } + + ret = fuse_compound_parse_resp(compound, compound->result_header.count, + (char *)resp_payload, + actual_response_size); +out: + kvfree(resp_payload); +out_free_buffer: + kvfree(buffer); + return ret; +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a806f..28c96961e85d1c 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 612d4da6d7d914..ab802fd544876c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -374,8 +375,6 @@ static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -398,7 +397,9 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); + + /* enqueue, as it is send to "fiq->ops queue" */ + trace_fuse_request_enqueue(req); fiq->ops->send_req(fiq, req); } @@ -639,6 +640,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(&req->fm->fc->iq); } ssize_t __fuse_simple_request(struct mnt_idmap *idmap, @@ -686,9 +690,6 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, static bool fuse_request_queue_background_uring(struct fuse_conn *fc, struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; - - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); @@ -713,6 +714,8 @@ static int fuse_request_queue_background(struct fuse_req *req) } __set_bit(FR_ISREPLY, &req->flags); + trace_fuse_request_bg_enqueue(req); + #ifdef CONFIG_FUSE_IO_URING if (fuse_uring_ready(fc)) return fuse_request_queue_background_uring(fc, req); @@ -827,7 +830,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write, } /* Unmap and put previous page of userspace buffer */ -static void fuse_copy_finish(struct fuse_copy_state *cs) +void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; @@ -893,6 +896,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs++; } + } else if (cs->ring.pages) { + cs->pg = cs->ring.pages[cs->ring.page_idx++]; + /* + * non stricly needed, just to avoid a uring exception in + * fuse_copy_finish + */ + get_page(cs->pg); + cs->len = PAGE_SIZE; + cs->offset = 0; } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); @@ -1448,6 +1460,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); + trace_fuse_request_send(req); args = req->args; reqsize = req->in.h.len; @@ -1529,14 +1542,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1556,8 +1589,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2230,7 +2263,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2252,11 +2285,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2342,7 +2374,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2386,6 +2418,45 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + fuse_uring_flush_bg(fc); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. + */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * @@ -2489,7 +2560,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2520,8 +2591,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2531,7 +2602,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2562,7 +2633,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2580,8 +2651,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2597,8 +2668,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2609,6 +2680,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2624,6 +2708,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2632,7 +2719,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1cc..97d7557a83554a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,9 +7,11 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include #include +#include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); @@ -17,7 +19,29 @@ MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_RING_HEADER_PG 0 +#define FUSE_RING_PAYLOAD_PG 1 +/* Threshold that determines if a better queue should be searched for */ +#define FUSE_URING_Q_THRESHOLD 2 + +/* Number of (re)tries to find a better queue */ +#define FUSE_URING_Q_TRIES 3 + +/* redfs only to allow patch backports */ +#define IO_URING_F_TASK_DEAD (1 << 13) + +#ifndef io_uring_cmd_to_pdu +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +/* red specific backport */ +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) +#endif bool fuse_uring_enabled(void) { @@ -47,7 +71,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -85,13 +109,14 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + queue->nr_reqs--; + list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } - spin_unlock(&queue->lock); if (error) @@ -111,19 +136,23 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); + queue->nr_reqs = 0; spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; - for (qid = 0; qid < ring->nr_queues; qid++) { + if (!ring) + return; + + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -133,10 +162,9 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); } } @@ -164,7 +192,7 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) if (!ring) return false; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -183,6 +211,40 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +/* + * Copy from memmap.c, should be exported + */ +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + +static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map) +{ + free_cpumask_var(q_map->registered_q_mask); + kfree(q_map->cpu_to_qid); +} + +static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) +{ + int node; + + fuse_ring_destruct_q_map(&ring->q_map); + + if (ring->numa_q_map) { + for (node = 0; node < ring->nr_numa_nodes; node++) + fuse_ring_destruct_q_map(&ring->numa_q_map[node]); + kfree(ring->numa_q_map); + } +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -191,7 +253,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -206,6 +268,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, + ent->nr_payload_pages); kfree(ent); } @@ -214,11 +279,47 @@ void fuse_uring_destruct(struct fuse_conn *fc) ring->queues[qid] = NULL; } + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); fc->ring = NULL; } +static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) +{ + if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; + + q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), + GFP_KERNEL_ACCOUNT); + if (!q_map->cpu_to_qid) + return -ENOMEM; + + return 0; +} + +static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues) +{ + int err, node; + + err = fuse_uring_init_q_map(&ring->q_map, nr_queues); + if (err) + return err; + + ring->numa_q_map = kcalloc(ring->nr_numa_nodes, + sizeof(*ring->numa_q_map), + GFP_KERNEL_ACCOUNT); + if (!ring->numa_q_map) + return -ENOMEM; + for (node = 0; node < ring->nr_numa_nodes; node++) { + err = fuse_uring_init_q_map(&ring->numa_q_map[node], + nr_queues); + if (err) + return err; + } + return 0; +} + /* * Basic ring setup for this connection based on the provided configuration */ @@ -228,11 +329,14 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; + int err; ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); if (!ring) return NULL; + ring->nr_numa_nodes = num_online_nodes(); + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), GFP_KERNEL_ACCOUNT); if (!ring->queues) @@ -241,6 +345,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + err = fuse_uring_create_q_masks(ring, nr_queues); + if (err) + goto out_err; + spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ @@ -251,7 +359,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - ring->nr_queues = nr_queues; + ring->max_nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; smp_store_release(&fc->ring, ring); @@ -260,17 +368,42 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return ring; out_err: + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); return res; } +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, + struct fuse_queue_map *q_map, + int node) +{ + int cpu, qid_idx, mapping_count = 0; + size_t nr_queues; + + cpumask_set_cpu(qid, q_map->registered_q_mask); + nr_queues = cpumask_weight(q_map->registered_q_mask); + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { + if (node != -1 && cpu_to_node(cpu) != node) + continue; + + qid_idx = mapping_count % nr_queues; + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx, + q_map->registered_q_mask); + mapping_count++; + pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n", + __func__, node, qid, qid_idx, nr_queues, cpu, + q_map->cpu_to_qid[cpu]); + } +} + static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; + int node; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) @@ -308,6 +441,22 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); + + /* Static mapping from cpu to per numa queues */ + node = cpu_to_node(qid); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node], node); + + /* + * smp_store_release, as the variable is read without fc->lock and + * we need to avoid compiler re-ordering of updating the nr_queues + * and setting ring->numa_queues[node].cpu_to_qid above + */ + smp_store_release (&ring->numa_q_map[node].nr_queues, + ring->numa_q_map[node].nr_queues + 1); + + /* global mapping */ + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map, -1); + spin_unlock(&fc->lock); return queue; @@ -323,11 +472,11 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) /* * Release a request/entry on connection tear down */ -static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent, int issue_flags) { struct fuse_req *req; struct io_uring_cmd *cmd; - + ssize_t queue_refs; struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); @@ -351,19 +500,20 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); if (req) fuse_uring_stop_fuse_req_end(req); + + queue_refs = atomic_dec_return(&queue->ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); } static void fuse_uring_stop_list_entries(struct list_head *head, struct fuse_ring_queue *queue, enum fuse_ring_req_state exp_state) { - struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent, *next; - ssize_t queue_refs = SSIZE_MAX; LIST_HEAD(to_teardown); spin_lock(&queue->lock); @@ -380,11 +530,8 @@ static void fuse_uring_stop_list_entries(struct list_head *head, spin_unlock(&queue->lock); /* no queue lock to avoid lock order issues */ - list_for_each_entry_safe(ent, next, &to_teardown, list) { - fuse_uring_entry_teardown(ent); - queue_refs = atomic_dec_return(&ring->queue_refs); - WARN_ON_ONCE(queue_refs < 0); - } + list_for_each_entry_safe(ent, next, &to_teardown, list) + fuse_uring_entry_teardown(ent, IO_URING_F_UNLOCKED); } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) @@ -403,7 +550,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) int qid; struct fuse_ring_ent *ent; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) @@ -422,6 +569,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } + spin_unlock(&queue->lock); } ring->stop_debug_log = 1; @@ -434,7 +582,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -468,16 +616,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; + int node; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); fuse_uring_teardown_entries(queue); } + /* Reset all queue masks, we won't process any more IO */ + cpumask_clear(ring->q_map.registered_q_mask); + for (node = 0; node < ring->nr_numa_nodes; node++) { + if (ring->numa_q_map) + cpumask_clear(ring->numa_q_map[node].registered_q_mask); + } + if (atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, @@ -500,7 +657,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue; - bool need_cmd_done = false; + bool teardown = false; /* * direct access on ent - it must not be destructed as long as @@ -509,17 +666,14 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, queue = ent->queue; spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - need_cmd_done = true; - ent->cmd = NULL; + ent->state = FRRS_TEARDOWN; + list_del_init(&ent->list); + teardown = true; } spin_unlock(&queue->lock); - if (need_cmd_done) { - /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); - } + if (teardown) + fuse_uring_entry_teardown(ent, issue_flags); } static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, @@ -596,13 +750,72 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, fuse_copy_init(&cs, false, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; - return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + fuse_copy_finish(&cs); + return err; } - /* - * Copy data from the req to the ring buffer - */ +/* + * Copy data from the req to the ring buffer + * In order to be able to write into the ring buffer from the application, + * i.e. to avoid io_uring_cmd_complete_in_task(), the header needs to be + * pinned as well. + */ +static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent, + struct fuse_uring_req_header *headers) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + fuse_copy_init(&cs, 1, NULL); + cs.is_uring = 1; + cs.req = req; + cs.ring.pages = ent->payload_pages; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + memcpy(&headers->op_in, in_args->value, in_args->size); + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + goto copy_finish; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + +copy_finish: + fuse_copy_finish(&cs); + return err; +} + +/* + * Copy data from the req to the ring buffer + */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { @@ -626,6 +839,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_init(&cs, true, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; if (num_args > 0) { /* @@ -650,12 +865,14 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); +copy_finish: + fuse_copy_finish(&cs); return err ? -EFAULT : 0; } @@ -665,6 +882,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; + struct fuse_uring_req_header *headers = NULL; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -677,22 +895,29 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; - /* copy the request */ - err = fuse_uring_args_to_ring(ring, req, ent); - if (unlikely(err)) { - pr_info_ratelimited("Copy to ring failed: %d\n", err); - return err; - } - /* copy fuse_in_header */ - err = copy_to_user(&ent->headers->in_out, &req->in.h, - sizeof(req->in.h)); - if (err) { - err = -EFAULT; - return err; + if (ent->header_pages) { + headers = kmap_local_page( + ent->header_pages[FUSE_RING_HEADER_PG]); + + memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); + kunmap_local(headers); + } else { + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) + err = -EFAULT; } - return 0; + return err; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, @@ -709,6 +934,21 @@ static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, return err; } +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move_tail(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + trace_fuse_request_send(ent->fuse_req); + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + /* * Write data to the ring buffer and send the request to userspace, * userspace will read it @@ -718,22 +958,13 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req, unsigned int issue_flags) { - struct fuse_ring_queue *queue = ent->queue; int err; - struct io_uring_cmd *cmd; err = fuse_uring_prepare_send(ent, req); if (err) return err; - spin_lock(&queue->lock); - cmd = ent->cmd; - ent->cmd = NULL; - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, 0, 0, issue_flags); + fuse_uring_send(ent, ent->cmd, 0, issue_flags); return 0; } @@ -888,7 +1119,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, if (!ring) return err; - if (qid >= ring->nr_queues) + if (qid >= ring->max_nr_queues) return -EINVAL; queue = ring->queues[qid]; @@ -945,59 +1176,43 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -static bool is_ring_ready(struct fuse_ring *ring, int current_qid) -{ - int qid; - struct fuse_ring_queue *queue; - bool ready = true; - - for (qid = 0; qid < ring->nr_queues && ready; qid++) { - if (current_qid == qid) - continue; - - queue = ring->queues[qid]; - if (!queue) { - ready = false; - break; - } - - spin_lock(&queue->lock); - if (list_empty(&queue->ent_avail_queue)) - ready = false; - spin_unlock(&queue->lock); - } - - return ready; -} - /* - * fuse_uring_req_fetch command handling + * Copy from memmap.c, should be exported there */ -static void fuse_uring_do_register(struct fuse_ring_ent *ent, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +static struct page **io_pin_pages(unsigned long uaddr, unsigned long len, + int *npages) { - struct fuse_ring_queue *queue = ent->queue; - struct fuse_ring *ring = queue->ring; - struct fuse_conn *fc = ring->fc; - struct fuse_iqueue *fiq = &fc->iq; - - fuse_uring_prepare_cancel(cmd, issue_flags, ent); - - spin_lock(&queue->lock); - ent->cmd = cmd; - fuse_uring_ent_avail(ent, queue); - spin_unlock(&queue->lock); - - if (!ring->ready) { - bool ready = is_ring_ready(ring, queue->qid); + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } - if (ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; } + kvfree(pages); + return ERR_PTR(ret); } /* @@ -1026,6 +1241,59 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, return 0; } +static int fuse_uring_pin_pages(struct fuse_ring_ent *ent) +{ + struct fuse_ring *ring = ent->queue->ring; + int err; + + /* + * This needs to do locked memory accounting, for now privileged servers + * only. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* Pin header pages */ + if (!PAGE_ALIGNED(ent->headers)) { + pr_info_ratelimited("ent->headers is not page-aligned: %p\n", + ent->headers); + return -EINVAL; + } + + ent->header_pages = io_pin_pages((unsigned long)ent->headers, + sizeof(struct fuse_uring_req_header), + &ent->nr_header_pages); + if (IS_ERR(ent->header_pages)) { + err = PTR_ERR(ent->header_pages); + pr_info_ratelimited("Failed to pin header pages, err=%d\n", + err); + ent->header_pages = NULL; + return err; + } + + if (ent->nr_header_pages != 1) { + pr_info_ratelimited("Header pages not pinned as one page\n"); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->header_pages = NULL; + return -EINVAL; + } + + /* Pin payload pages */ + ent->payload_pages = io_pin_pages((unsigned long)ent->payload, + ring->max_payload_sz, + &ent->nr_payload_pages); + if (IS_ERR(ent->payload_pages)) { + err = PTR_ERR(ent->payload_pages); + pr_info_ratelimited("Failed to pin payload pages, err=%d\n", + err); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->payload_pages = NULL; + return err; + } + + return 0; +} + static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) @@ -1067,6 +1335,12 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + err = fuse_uring_pin_pages(ent); + if (err) { + kfree(ent); + return ERR_PTR(err); + } + atomic_inc(&ring->queue_refs); return ent; } @@ -1082,6 +1356,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; + struct fuse_iqueue *fiq = &fc->iq; int err; unsigned int qid = READ_ONCE(cmd_req->qid); @@ -1092,7 +1367,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, return err; } - if (qid >= ring->nr_queues) { + if (qid >= ring->max_nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } @@ -1113,7 +1388,19 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, if (IS_ERR(ent)) return PTR_ERR(ent); - fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + if (!ring->ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + + spin_lock(&queue->lock); + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* Marks the ring entry as ready */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); return 0; } @@ -1139,9 +1426,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; @@ -1189,20 +1476,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; } -static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, - ssize_t ret, unsigned int issue_flags) -{ - struct fuse_ring_queue *queue = ent->queue; - - spin_lock(&queue->lock); - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - ent->cmd = NULL; - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, ret, 0, issue_flags); -} - /* * This prepares and sends the ring request in fuse-uring task context. * User buffers are not mapped yet - the application does not have permission @@ -1228,30 +1501,107 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; - struct fuse_ring_queue *queue; + int node, tries = 0; + unsigned int nr_queues; + unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; - qid = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy. + */ + if (background) + cpu++; - if (WARN_ONCE(qid >= ring->nr_queues, - "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->nr_queues)) - qid = 0; +retry: + cpu = cpu % ring->max_nr_queues; + + /* numa local registered queue bitmap */ + node = cpu_to_node(cpu); + if (WARN_ONCE(node >= ring->nr_numa_nodes, + "Node number (%d) exceeds nr nodes (%d)\n", + node, ring->nr_numa_nodes)) { + node = 0; + } - queue = ring->queues[qid]; - WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); + if (nr_queues) { + /* prefer the queue that corresponds to the current cpu */ + queue = READ_ONCE(ring->queues[cpu]); + if (queue) { + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + primary_queue = queue; + } - return queue; + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + if (qid != cpu) { + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + } + + /* Retries help for load balancing */ + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) { + if (!primary_queue) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a different qid*/ + cpu++; + tries++; + goto retry; + } + } + + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + + /* global registered queue bitmap */ + qid = ring->q_map.cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) { + /* Might happen on teardown */ + return NULL; + } + return READ_ONCE(ring->queues[qid]); } -static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) { struct io_uring_cmd *cmd = ent->cmd; - uring_cmd_set_ring_ent(cmd, ent); - io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + /* + * Task needed when pages are not pinned as the application doing IO + * is not allowed to write into fuse-server pages. + * Additionally for IO through io-uring as issue flags are unknown then. + * backgrounds requests might hold spin-locks, that conflict with + * io_uring_cmd_done() mutex lock. + */ + if (!ent->header_pages || current->io_uring || bg) { + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + } else { + int err = fuse_uring_prepare_send(ent, ent->fuse_req); + struct fuse_ring_queue *queue = ent->queue; + + if (err) { + fuse_uring_next_fuse_req(ent, queue, + IO_URING_F_UNLOCKED); + return; + } + fuse_uring_send(ent, cmd, 0, IO_URING_F_UNLOCKED); + } } /* queue a fuse request and send it if a ring entry is available */ @@ -1264,13 +1614,10 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - spin_lock(&queue->lock); err = -ENOTCONN; if (unlikely(queue->stopped)) @@ -1280,14 +1627,17 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); + queue->nr_reqs++; + if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); if (ent) - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, false); return; @@ -1306,7 +1656,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; @@ -1319,6 +1669,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) set_bit(FR_URING, &req->flags); req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); + queue->nr_reqs++; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); @@ -1326,7 +1677,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); /* @@ -1340,7 +1691,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, true); } else { spin_unlock(&queue->lock); } @@ -1351,8 +1702,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) bool fuse_uring_remove_pending_req(struct fuse_req *req) { struct fuse_ring_queue *queue = req->ring_queue; + bool removed = fuse_remove_pending_req(req, &queue->lock); + + if (removed) { + /* Update counters after successful removal */ + spin_lock(&queue->lock); + queue->nr_reqs--; + spin_unlock(&queue->lock); + } - return fuse_remove_pending_req(req, &queue->lock); + return removed; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 51a563922ce141..4caf7626604c8e 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -40,7 +40,11 @@ enum fuse_ring_req_state { struct fuse_ring_ent { /* userspace buffer */ struct fuse_uring_req_header __user *headers; + struct page **header_pages; + int nr_header_pages; void __user *payload; + struct page **payload_pages; + int nr_payload_pages; /* the ring queue that owns the request */ struct fuse_ring_queue *queue; @@ -94,6 +98,9 @@ struct fuse_ring_queue { /* background fuse requests */ struct list_head fuse_req_bg_queue; + /* number of requests queued or in userspace */ + unsigned int nr_reqs; + struct fuse_pqueue fpq; unsigned int active_background; @@ -101,6 +108,17 @@ struct fuse_ring_queue { bool stopped; }; +struct fuse_queue_map { + /* Tracks which queues are registered */ + cpumask_var_t registered_q_mask; + + /* number of registered queues */ + size_t nr_queues; + + /* cpu to qid mapping */ + int *cpu_to_qid; +}; + /** * Describes if uring is for communication and holds alls the data needed * for uring communication @@ -110,7 +128,10 @@ struct fuse_ring { struct fuse_conn *fc; /* number of ring queues */ - size_t nr_queues; + size_t max_nr_queues; + + /* number of numa nodes */ + int nr_numa_nodes; /* maximum payload/arg size */ size_t max_payload_sz; @@ -122,6 +143,12 @@ struct fuse_ring { */ unsigned int stop_debug_log : 1; + /* per numa node queue tracking */ + struct fuse_queue_map *numa_q_map; + + /* all queue tracking */ + struct fuse_queue_map q_map; + wait_queue_head_t stop_waitq; /* async tear down */ @@ -138,7 +165,7 @@ struct fuse_ring { bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); -void fuse_uring_abort_end_requests(struct fuse_ring *ring); +void fuse_uring_flush_bg(struct fuse_conn *fc); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); @@ -153,7 +180,7 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) return; if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_abort_end_requests(ring); + fuse_uring_flush_bg(fc); fuse_uring_stop_queues(ring); } } @@ -206,6 +233,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +static inline void fuse_uring_flush_bg(struct fuse_conn *fc) +{ +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5c569c3cb53f3d..5e96dc72a63854 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -1306,14 +1307,7 @@ static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.opcode = FUSE_GETATTR; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + fuse_getattr_args_fill(&args, get_node_id(inode), &inarg, &outarg); err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || @@ -1989,6 +1983,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * truncation has already been done by OPEN. But still * need to truncate page cache. */ + if (fc->dlm && fc->writeback_cache) + fuse_dlm_cache_release_locks(fi); i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; @@ -2094,6 +2090,9 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (fc->dlm && fc->writeback_cache) + fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a52cf1b9cfc650..f86f7ad1815ff6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -23,6 +24,39 @@ #include #include +/* + * Helper function to initialize fuse_args for OPEN/OPENDIR operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg) +{ + args->opcode = opcode; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + +/* + * Helper function to initialize fuse_args for GETATTR operations + */ +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg) +{ + args->opcode = FUSE_GETATTR; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, struct fuse_open_out *outargp) @@ -40,14 +74,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = opcode; - args.nodeid = nodeid; - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(*outargp); - args.out_args[0].value = outargp; + fuse_open_args_fill(&args, nodeid, opcode, &inarg, outargp); return fuse_simple_request(fm, &args); } @@ -124,8 +151,66 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) } } +static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, + int flags, int opcode, + struct fuse_file *ff, + struct fuse_attr_out *outattrp, + struct fuse_open_out *outopenp) +{ + struct fuse_compound_req *compound; + struct fuse_args open_args = {}; + struct fuse_args getattr_args = {}; + struct fuse_open_in open_in = {}; + struct fuse_getattr_in getattr_in = {}; + int err; + + compound = fuse_compound_alloc(fm, 0); + if (IS_ERR(compound)) + return PTR_ERR(compound); + + open_in.flags = flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) + open_in.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) + open_in.open_flags |= FUSE_OPEN_KILL_SUIDGID; + + fuse_open_args_fill(&open_args, nodeid, opcode, &open_in, outopenp); + + err = fuse_compound_add(compound, &open_args); + if (err) + goto out; + + fuse_getattr_args_fill(&getattr_args, nodeid, &getattr_in, outattrp); + + err = fuse_compound_add(compound, &getattr_args); + if (err) + goto out; + + err = fuse_compound_send(compound); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 0); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 1); + if (err) + goto out; + + ff->fh = outopenp->fh; + ff->open_flags = outopenp->open_flags; + +out: + kfree(compound); + return err; +} + struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir) + struct inode *inode, + unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; @@ -142,23 +227,46 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (open) { /* Store outarg for fuse_finish_open() */ struct fuse_open_out *outargp = &ff->args->open_outarg; - int err; + int err = -ENOSYS; + + if (inode && fc->compound_open_getattr) { + struct fuse_attr_out attr_outarg; + + err = fuse_compound_open_getattr(fm, nodeid, open_flags, + opcode, ff, + &attr_outarg, outargp); + if (err == -ENOSYS) + fc->compound_open_getattr = 0; + if (!err) + fuse_change_attributes(inode, &attr_outarg.attr, + NULL, + ATTR_TIMEOUT(&attr_outarg), + fuse_get_attr_version(fc)); + } + if (err == -ENOSYS) { + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); + if (!err) { + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; + } + } - err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); - if (!err) { - ff->fh = outargp->fh; - ff->open_flags = outargp->open_flags; - } else if (err != -ENOSYS) { - fuse_file_free(ff); - return ERR_PTR(err); - } else { - /* No release needed */ - kfree(ff->args); - ff->args = NULL; - if (isdir) - fc->no_opendir = 1; - else - fc->no_open = 1; + if (err) { + if (err != -ENOSYS) { + /* err is not ENOSYS */ + fuse_file_free(ff); + return ERR_PTR(err); + } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; + + /* we don't have open */ + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; + } } } @@ -173,11 +281,10 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { - struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + struct fuse_file *ff = fuse_file_open(fm, nodeid, file_inode(file), file->f_flags, isdir); if (!IS_ERR(ff)) file->private_data = ff; - return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); @@ -655,6 +762,18 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; + + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + if (io->write && res && mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + io->offset >> PAGE_SHIFT, + (io->offset + res - 1) >> PAGE_SHIFT); + } spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); @@ -823,8 +942,11 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ @@ -1075,9 +1197,11 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, { struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; + ssize_t written; ssize_t err; fuse_write_args_fill(ia, ff, pos, count); @@ -1091,10 +1215,26 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return fuse_async_req_send(fm, ia, count); err = fuse_simple_request(fm, &ia->ap.args); - if (!err && ia->write.out.size > count) + written = ia->write.out.size; + if (!err && written > count) err = -EIO; - return err ?: ia->write.out.size; + /* + * Without FOPEN_DIRECT_IO, generic_file_direct_write() does the + * invalidation for us. + */ + if (!err && written && mapping->nrpages && + (ff->open_flags & FOPEN_DIRECT_IO)) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + (pos + written - 1) >> PAGE_SHIFT); + } + + return err ?: written; } bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) @@ -1434,6 +1574,27 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!fc->handle_killpriv_v2 || !setattr_should_drop_suidgid(idmap, file_inode(file))) writeback = true; + + /* + * If we have dlm support acquire the lock for the area + * we are writing into. + */ + if (fc->dlm) { + /* + * Note that a file opened with O_APPEND will have + * relative values in ki_pos. This code is here for + * convenience and for libfuse overlay test. + * Filesystems should handle O_APPEND with 'direct io' + * to additionally get the performance benefits of + * 'parallel direct writes'. + */ + loff_t pos = file->f_flags & O_APPEND ? + i_size_read(inode) + iocb->ki_pos : + iocb->ki_pos; + size_t length = iov_iter_count(from); + + fuse_get_dlm_write_lock(file, pos, length); + } } inode_lock(inode); @@ -1453,7 +1614,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from)) goto out; written = direct_write_fallback(iocb, from, written, - fuse_perform_write(iocb, from)); + fuse_perform_write(iocb, from)); } else if (writeback) { /* * Use iomap so that we can do granular uptodate reads @@ -1609,7 +1770,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); @@ -2101,9 +2262,12 @@ static void fuse_writepages_send(struct inode *inode, spin_unlock(&fi->lock); } -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, - unsigned len, struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) + +static bool fuse_writepage_need_send(struct fuse_conn *fc, + loff_t pos, unsigned len, + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data, + struct writeback_control *wbc) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; @@ -2132,6 +2296,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, !fuse_pages_realloc(data, fc->max_pages)) return true; + /* Reached alignment */ + if (fc->alignment_pages) { + unsigned int total_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t page_index = pos >> PAGE_SHIFT; + + if (!(page_index % fc->alignment_pages)) { + pgoff_t end_page_index = (wbc->range_end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* we are at a point where we would write aligned + * check if we potentially could reach the next alignment */ + if (page_index + fc->alignment_pages > end_page_index) + return true; + + if (total_pages + fc->alignment_pages > fc->max_pages) + return true; + } + } return false; } @@ -2155,7 +2336,7 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, return -EIO; } - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data, wpc->wbc)) { fuse_writepages_send(inode, data); data->wpa = NULL; data->nr_bytes = 0; @@ -2261,6 +2442,60 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. + */ +static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = ff->fm; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK))) + return -EIO; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.start = offset; + inarg.end = offset + length - 1; + inarg.type = FUSE_DLM_PAGE_MKWRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->dlm = 0; + err = 0; + } + + if (!err && + fc->dlm && + (outarg.start > inarg.start || + outarg.end < inarg.end)) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu:%llu bytes returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); + fuse_abort_conn(fc); + err = -EINVAL; + } + return err; +} /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly @@ -2279,7 +2514,18 @@ static void fuse_vma_close(struct vm_area_struct *vma) static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); - struct inode *inode = file_inode(vmf->vma->vm_file); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + + if (fm->fc->dlm) { + loff_t pos = vmf->pgoff << PAGE_SHIFT; + size_t length = PAGE_SIZE; + int err = fuse_get_page_mkwrite_lock(file, pos, length); + if (err < 0) { + return vmf_error(err); + } + } file_update_time(vmf->vma->vm_file); folio_lock(folio); @@ -3119,6 +3365,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); + fuse_dlm_cache_init(fi); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); @@ -3126,4 +3373,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); + + mapping_set_large_folios(inode->i_mapping); } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a3193d..4037fd7bdeee66 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -34,18 +36,27 @@ struct fuse_copy_state { bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ + struct page **pages; + int page_idx; } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); @@ -53,6 +64,7 @@ void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); +void fuse_copy_finish(struct fuse_copy_state *cs); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, int zeroing); diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c new file mode 100644 index 00000000000000..d765dd8018cc6a --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FUSE page lock cache implementation + */ +#include "fuse_i.h" +#include "fuse_dlm_cache.h" + +#include +#include +#include +#include + + +/* A range of pages with a lock */ +struct fuse_dlm_range { + /* Interval tree node */ + struct rb_node rb; + /* Start page offset (inclusive) */ + uint64_t start; + /* End page offset (inclusive) */ + uint64_t end; + /* Subtree end value for interval tree */ + uint64_t __subtree_end; + /* Lock mode */ + enum fuse_page_lock_mode mode; + /* Temporary list entry for operations */ + struct list_head list; +}; + +/* Lock modes for FUSE page cache */ +#define FUSE_PCACHE_LK_READ 1 /* Shared read lock */ +#define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ + +/* Interval tree definitions for page ranges */ +static inline uint64_t fuse_dlm_range_start(struct fuse_dlm_range *range) +{ + return range->start; +} + +static inline uint64_t fuse_dlm_range_last(struct fuse_dlm_range *range) +{ + return range->end; +} + +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, uint64_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); + +/** + * fuse_page_cache_init - Initialize a page cache lock manager + * @cache: The cache to initialize + * + * Initialize a page cache lock manager for a FUSE inode. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_cache_init(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + + if (!cache) + return -EINVAL; + + init_rwsem(&cache->lock); + cache->ranges = RB_ROOT_CACHED; + + return 0; +} + +/** + * fuse_page_cache_destroy - Clean up a page cache lock manager + * @cache: The cache to clean up + * + * Release all locks and free all resources associated with the cache. + */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + struct rb_node *node; + + if (!cache) + return; + + /* Release all locks */ + down_write(&cache->lock); + while ((node = rb_first_cached(&cache->ranges)) != NULL) { + range = rb_entry(node, struct fuse_dlm_range, rb); + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + up_write(&cache->lock); +} + +/** + * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end] + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Return: Pointer to the first overlapping range, or NULL if none found + */ +static struct fuse_dlm_range * +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + return fuse_page_it_iter_first(&cache->ranges, start, end); +} + +/** + * fuse_page_try_merge - Try to merge ranges within a specific region + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Attempt to merge ranges within and adjacent to the specified region + * that have the same lock mode. + */ +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + struct fuse_dlm_range *range, *next; + struct rb_node *node; + + if (!cache) + return; + + /* Find the first range that might need merging */ + range = NULL; + node = rb_first_cached(&cache->ranges); + while (node) { + range = rb_entry(node, struct fuse_dlm_range, rb); + if (range->end >= start - 1) + break; + node = rb_next(node); + } + + if (!range || range->start > end + 1) + return; + + /* Try to merge ranges in and around the specified region */ + while (range && range->start <= end + 1) { + /* Get next range before we potentially modify the tree */ + next = NULL; + if (rb_next(&range->rb)) { + next = rb_entry(rb_next(&range->rb), + struct fuse_dlm_range, rb); + } + + /* Try to merge with next range if adjacent and same mode */ + if (next && range->mode == next->mode && + range->end + 1 == next->start) { + /* Merge ranges */ + range->end = next->end; + + /* Remove next from tree */ + fuse_page_it_remove(next, &cache->ranges); + kfree(next); + + /* Continue with the same range */ + continue; + } + + /* Move to next range */ + range = next; + } +} + +/** + * fuse_dlm_lock_range - Lock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode (read or write) + * + * Add a locked range on the specified range of pages. + * If parts of the range are already locked, only add the remaining parts. + * For overlapping ranges, handle lock compatibility: + * - READ locks are compatible with existing READ locks + * - READ locks are compatible with existing WRITE locks (downgrade not needed) + * - WRITE locks need to upgrade existing READ locks + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *new_range, *next; + int lock_mode; + int ret = 0; + LIST_HEAD(to_lock); + LIST_HEAD(to_upgrade); + uint64_t current_start = start; + + if (!cache || start > end) + return -EINVAL; + + /* Convert to lock mode */ + lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. */ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. + * + * Return: 0 on success, negative error code on failure + */ +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + struct fuse_dlm_range *range, *new_range; + int ret = 0; + + if (!cache || start > end) + return -EINVAL; + + /* Find a range that contains [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + if (!range) { + ret = -EINVAL; + goto out; + } + + /* If the hole is at the beginning of the range */ + if (start == range->start) { + range->start = end + 1; + goto out; + } + + /* If the hole is at the end of the range */ + if (end == range->end) { + range->end = start - 1; + goto out; + } + + /* The hole is in the middle, need to split */ + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out; + } + + /* Copy properties from original range */ + *new_range = *range; + INIT_LIST_HEAD(&new_range->list); + + /* Adjust ranges */ + new_range->start = end + 1; + range->end = start - 1; + + /* Update interval tree */ + fuse_page_it_remove(range, &cache->ranges); + fuse_page_it_insert(range, &cache->ranges); + fuse_page_it_insert(new_range, &cache->ranges); + +out: + return ret; +} + +/** + * fuse_dlm_unlock_range - Unlock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Release locks on the specified range of pages. + * Note that if start and end are set to zero the cache is destroyed. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, + uint64_t start, uint64_t end) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *next; + int ret = 0; + + if (!cache) + return -EINVAL; + + if (start == 0 && end == 0) { + fuse_dlm_cache_release_locks(inode); + return 0; + } + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check if we need to punch a hole */ + if (start > range->start && end < range->end) { + /* Punch a hole in the middle */ + ret = fuse_dlm_punch_hole(cache, start, end); + if (ret) + goto out; + /* After punching a hole, we're done */ + break; + } else if (start > range->start) { + /* Adjust the end of the range */ + range->end = start - 1; + } else if (end < range->end) { + /* Adjust the start of the range */ + range->start = end + 1; + } else { + /* Complete overlap, remove the range */ + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + + range = next; + } + +out: + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_range_is_locked - Check if a page range is already locked + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode to check for (or NULL to check for any lock) + * + * Check if the specified range of pages is already locked. + * The entire range must be locked for this to return true. + * + * Return: true if the entire range is locked, false otherwise + */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + int lock_mode = 0; + uint64_t current_start = start; + + if (!cache || start > end) + return false; + + /* Convert to lock mode if specified */ + if (mode == FUSE_PAGE_LOCK_READ) + lock_mode = FUSE_PCACHE_LK_READ; + else if (mode == FUSE_PAGE_LOCK_WRITE) + lock_mode = FUSE_PCACHE_LK_WRITE; + + down_read(&cache->lock); + + /* Find the first range that overlaps with [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + + /* Check if the entire range is covered */ + while (range && current_start <= end) { + /* If we're checking for a specific mode, verify it matches */ + if (lock_mode && range->mode != lock_mode) { + /* Wrong lock mode */ + up_read(&cache->lock); + return false; + } + + /* Check if there's a gap before this range */ + if (current_start < range->start) { + /* Found a gap */ + up_read(&cache->lock); + return false; + } + + /* Move current_start past this range */ + current_start = range->end + 1; + + /* Get next overlapping range */ + range = fuse_page_it_iter_next(range, start, end); + } + + /* Check if we covered the entire range */ + if (current_start <= end) { + /* There's a gap at the end */ + up_read(&cache->lock); + return false; + } + + up_read(&cache->lock); + return true; +} + +/** + * request a dlm lock from the fuse server + */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + uint64_t end = (offset + length - 1) | (PAGE_SIZE - 1); + + /* note that the offset and length don't have to be page aligned here + * but since we only get here on writeback caching we will send out + * page aligned requests */ + offset &= PAGE_MASK; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + /* note that this can be run from different processes + * at the same time. It is intentionally not protected + * since a DLM implementation in the FUSE server should take care + * of any races in lock requests */ + if (fuse_dlm_range_is_locked(fi, offset, + end, FUSE_PAGE_LOCK_WRITE)) + return; /* we already have this area locked */ + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.start = offset; + inarg.end = end; + inarg.type = FUSE_DLM_LOCK_WRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + /* fuse server does not support dlm, save the info */ + fc->dlm = 0; + return; + } + + if (err) + return; + else + if (inarg.start < outarg.start || + inarg.end > outarg.end) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu:%llu returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); + fuse_abort_conn(fc); + return; + } else { + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, outarg.start, + outarg.end, + FUSE_PAGE_LOCK_WRITE); + } +} diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h new file mode 100644 index 00000000000000..438d31d28b666e --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * FUSE page cache lock implementation + */ + +#ifndef _FS_FUSE_DLM_CACHE_H +#define _FS_FUSE_DLM_CACHE_H + +#include +#include +#include +#include + + +struct fuse_inode; + +/* Lock modes for page ranges */ +enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE }; + +/* Page cache lock manager */ +struct fuse_dlm_cache { + /* Lock protecting the tree */ + struct rw_semaphore lock; + /* Interval tree of locked ranges */ + struct rb_root_cached ranges; +}; + +/* Initialize a page cache lock manager */ +int fuse_dlm_cache_init(struct fuse_inode *inode); + +/* Clean up a page cache lock manager */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode); + +/* Lock a range of pages */ +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); + +/* Unlock a range of pages */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end); + +/* Check if a page range is already locked */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); + +/* this is the interface to the filesystem */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length); + +#endif /* _FS_FUSE_DLM_CACHE_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e14..fa554029fa8cf6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -31,6 +31,7 @@ #include #include #include +#include "fuse_dlm_cache.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -106,6 +107,17 @@ struct fuse_backing { struct rcu_head rcu; }; +/** + * data structure to save the information that we have + * requested dlm locks for the given area from the fuse server +*/ +struct dlm_locked_area +{ + struct list_head list; + loff_t offset; + size_t size; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -161,6 +173,9 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; + + /* dlm locked areas we have sent lock requests for */ + struct fuse_dlm_cache dlm_locked_areas; }; /* readdir cache (directory only) */ @@ -758,6 +773,12 @@ struct fuse_conn { */ unsigned handle_killpriv_v2:1; + /* invalidate inode entries when doing inode invalidation */ + unsigned inval_inode_entries:1; + + /* expire inode entries when doing inode invalidation */ + unsigned expire_inode_entries:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -892,6 +913,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* do we have support for dlm in the fs? */ + unsigned int dlm:1; + /** Passthrough support for read/write IO */ unsigned int passthrough:1; @@ -901,11 +925,17 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; /** Maximum stack depth for passthrough backing files */ int max_stack_depth; + + /* Does the filesystem support compound operations? */ + unsigned int compound_open_getattr:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -983,6 +1013,10 @@ struct fuse_conn { * inode->i_blkbits. */ u8 blkbits; + + /* The foffset alignment in PAGE */ + unsigned int alignment_pages; + }; /* @@ -1160,6 +1194,14 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); +/* + * Helper functions to initialize fuse_args for common operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg); +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg); struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); @@ -1254,6 +1296,18 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); +/** + * Compound request API + */ +struct fuse_compound_req; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, uint32_t flags); +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args); +ssize_t fuse_compound_send(struct fuse_compound_req *compound); +int fuse_compound_get_error(struct fuse_compound_req * compound, + int op_idx); + /** * End a finished request */ @@ -1266,6 +1320,12 @@ void fuse_wait_aborted(struct fuse_conn *fc); /* Check if any requests timed out */ void fuse_check_timeout(struct work_struct *work); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ @@ -1315,7 +1375,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -1508,7 +1568,9 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); /* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir); + struct inode *inode, + unsigned int open_flags, + bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index bbe9ddd8c71696..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,6 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* @@ -77,30 +78,55 @@ OPCODES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -TRACE_EVENT(fuse_request_send, +#define FUSE_REQ_TRACE_FIELDS \ + __field(dev_t, connection) \ + __field(uint64_t, unique) \ + __field(enum fuse_opcode, opcode) \ + __field(uint32_t, len) \ + +#define FUSE_REQ_TRACE_ASSIGN(req) \ + do { \ + __entry->connection = req->fm->fc->dev; \ + __entry->unique = req->in.h.unique; \ + __entry->opcode = req->in.h.opcode; \ + __entry->len = req->in.h.len; \ + } while (0) + + +TRACE_EVENT(fuse_request_enqueue, TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_bg_enqueue, + TP_PROTO(const struct fuse_req *req), TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), - TP_STRUCT__entry( - __field(dev_t, connection) - __field(uint64_t, unique) - __field(enum fuse_opcode, opcode) - __field(uint32_t, len) - ), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); - TP_fast_assign( - __entry->connection = req->fm->fc->dev; - __entry->unique = req->in.h.unique; - __entry->opcode = req->in.h.opcode; - __entry->len = req->in.h.len; - ), +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), TP_printk("connection %u req %llu opcode %u (%s) len %u ", __entry->connection, __entry->unique, __entry->opcode, __print_symbolic(__entry->opcode, OPCODES), __entry->len) ); + TRACE_EVENT(fuse_request_end, TP_PROTO(const struct fuse_req *req), diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7c0403a002e759..ada68233879740 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,8 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -31,13 +33,19 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); +static bool __read_mostly enable_compound; +module_param(enable_compound, bool, 0644); +MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); + static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned int fuse_max_pages_limit = 256; +unsigned int fuse_max_pages_limit = 4097; + /* default is no timeout */ unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; @@ -193,6 +201,7 @@ static void fuse_evict_inode(struct inode *inode) WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); + fuse_dlm_cache_release_locks(fi); } } @@ -551,6 +560,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, return NULL; } +static void fuse_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); +} + +static void fuse_invalidate_inode_entry(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + /* For directories, use d_invalidate to handle children and submounts */ + dentry = d_find_alias(inode); + if (dentry) { + d_invalidate(dentry); + fuse_invalidate_entry_cache(dentry); + dput(dentry); + } + } else { + /* For regular files, just unhash the dentry */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + } +} + int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { @@ -568,6 +616,11 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); + if (fc->inval_inode_entries) + fuse_invalidate_inode_entry(inode); + else if (fc->expire_inode_entries) + fuse_prune_aliases(inode); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -576,6 +629,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; + + if (fc->dlm && fc->writeback_cache) + /* Invalidate the range exactly as the fuse server requested + * except for the case where it sends -1. + * Note that this can lead to some inconsistencies if + * the fuse server sends unaligned data */ + fuse_dlm_unlock_range(fi, + offset, + pg_end == -1 ? 0 : + (offset + len - 1)); + invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -977,6 +1041,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->blocked = 0; fc->initialized = 0; fc->connected = 1; + fc->dlm = 1; + + /* module option for now */ + fc->compound_open_getattr = enable_compound; + atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); @@ -1427,6 +1496,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; } + + if (flags & FUSE_ALIGN_PG_ORDER) { + if (arg->align_page_order > 0) { + fc->alignment_pages = + (1UL << arg->align_page_order) + >> PAGE_SHIFT; + } + } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; if (flags & FUSE_ALLOW_IDMAP) { @@ -1440,6 +1517,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if (flags & FUSE_REQUEST_TIMEOUT) timeout = arg->request_timeout; + if (flags & FUSE_INVAL_INODE_ENTRY) + fc->inval_inode_entries = 1; + if (flags & FUSE_EXPIRE_INODE_ENTRY) + fc->expire_inode_entries = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1448,7 +1529,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, init_server_timeout(fc, timeout); - fm->sb->s_bdi->ra_pages = + if (CAP_SYS_ADMIN) + fm->sb->s_bdi->ra_pages = ra_pages; + else + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; @@ -1466,7 +1550,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1476,8 +1560,7 @@ void fuse_send_init(struct fuse_mount *fm) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - flags = - FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -1490,7 +1573,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | - FUSE_REQUEST_TIMEOUT; + FUSE_REQUEST_TIMEOUT | FUSE_INVAL_INODE_ENTRY | + FUSE_EXPIRE_INODE_ENTRY | FUSE_URING_REDUCED_Q; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1525,10 +1609,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1869,8 +1973,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1878,8 +1986,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1900,6 +2010,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1920,8 +2031,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1982,7 +2095,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); @@ -2053,6 +2166,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fdc175e93f7474..07a02e47b2c3a6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -494,7 +494,7 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); - return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); + return fuse_file_open(fm, get_node_id(inode), NULL, O_RDONLY, isdir); } static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 38051e5fba19ba..fc479e9aef9599 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1482,9 +1482,6 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - clear_bit(FR_PENDING, &req->flags); fs = fiq->priv; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4da..3cb85385cb87f5 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -443,6 +443,12 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. * init_out.request_timeout contains the timeout (in secs) + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for + * optimal io-size alignment + * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free + * to register between 1 and nr-core io-uring queues */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -491,6 +497,11 @@ struct fuse_file_lock { #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) +#define FUSE_ALIGN_PG_ORDER (1ULL << 50) +#define FUSE_URING_REDUCED_Q (1ULL << 59) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) + /** * CUSE INIT request/reply flags * @@ -658,6 +669,16 @@ enum fuse_opcode { FUSE_TMPFILE = 51, FUSE_STATX = 52, + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, + + /* A compound request works like multiple simple requests. + * This is a special case for calls that can be combined atomic on the + * fuse server. If the server actually does atomically execute the command is + * left to the fuse server implementation. + */ + FUSE_COMPOUND = 101, + /* CUSE specific operations */ CUSE_INIT = 4096, @@ -905,6 +926,9 @@ struct fuse_init_in { #define FUSE_COMPAT_INIT_OUT_SIZE 8 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24 +/* + * align_page_order: Number of pages for optimal IO, or a multiple of that + */ struct fuse_init_out { uint32_t major; uint32_t minor; @@ -919,7 +943,9 @@ struct fuse_init_out { uint32_t flags2; uint32_t max_stack_depth; uint16_t request_timeout; - uint16_t unused[11]; + uint8_t align_page_order; + uint8_t padding; + uint16_t unused[10]; }; #define CUSE_INIT_INFO_MAX 4096 @@ -1126,6 +1152,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; @@ -1227,6 +1254,74 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Type of the dlm lock requested + */ +enum fuse_dlm_lock_type { + FUSE_DLM_LOCK_NONE = 0, + FUSE_DLM_LOCK_READ = 1, + FUSE_DLM_LOCK_WRITE = 2, + FUSE_DLM_PAGE_MKWRITE = 3, +}; + +/** + * struct fuse_dlm_lock_in - Lock request + * @fh: file handle + * @offset: offset into the file + * @size: size of the locked region + * @type: type of lock + */ +struct fuse_dlm_lock_in { + uint64_t fh; + uint64_t start; + uint64_t end; + uint32_t type; + uint32_t reserved; +}; + + +/** + * struct fuse_dlm_lock_out - Lock response + * @locksize: how many bytes where locked by the call + * (most of the time we want to lock more than is requested + * to reduce number of calls) + */ +struct fuse_dlm_lock_out { + uint64_t start; + uint64_t end; + uint64_t reserved; +}; + +/* + * Compound request header + * + * This header is followed by the fuse requests + */ +struct fuse_compound_in { + uint32_t count; /* Number of operations */ + uint32_t flags; /* Compound flags */ + + /* Total size of all results. + * This is needed for preallocating the whole result for all + * commands in this compound. + */ + uint32_t result_size; + uint64_t reserved; +}; + +/* + * Compound response header + * + * This header is followed by complete fuse responses + */ +struct fuse_compound_out { + uint32_t count; /* Number of results */ + uint32_t flags; /* Result flags */ + uint64_t reserved; +}; + +#define FUSE_MAX_COMPOUND_OPS 16 /* Maximum operations per compound */ + /** * Size of the ring buffer header */