From 4195a8100ed764fcd8bbb9654724c05ca7e644e7 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 15 Jun 2026 07:56:02 +0200
Subject: [PATCH 1/3] fuse: drop BDI_CAP_STRICTLIMIT from fuse bdi setup

With BDI_CAP_STRICTLIMIT set, the dirty window of a fuse bdi can degrade
to a single page under streaming writes, which turns large application
writes into page-sized WRITE requests.  Clear the flag instead of setting
it when the fuse bdi is initialised.

Signed-off-by: Horst Birthelmer
---
 fs/fuse/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a2d496160da4c5..259daca77475fe 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1745,7 +1745,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
 	/* fuse does it's own writeback accounting */
 	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
-	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
 	 * For a single fuse filesystem use max 1% of dirty +
From 5740a071669398e676b2fd317214b9ad63548e6a Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 15 Jun 2026 07:57:01 +0200
Subject: [PATCH 2/3] fuse: use writethrough for writes matching the server
 alignment

Writes that already match the alignment advertised via
FUSE_ALIGN_PG_ORDER gain nothing from the writeback cache and can degrade
into page-sized WRITE requests under dirty throttling. Send them through
fuse_perform_write() instead, which packs requests up to max_write and
keeps them stripe-aligned for the backend. They create no dirty pages, so
no DLM write lock needs to be cached for them. Unaligned writes keep using
the writeback cache.

Also clarify in the uapi header that align_page_order is the log2 of the
alignment in bytes, not in pages.

Signed-off-by: Horst Birthelmer
---
 fs/fuse/file.c            | 39 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fuse.h |  7 ++++---
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 55ab5e9cf61715..3423db2ffe7e76 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1464,6 +1464,42 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
 	}
 }
 
+
+/*
+ * Return true if this buffered write should use the writeback cache, false
+ * if it should be sent writethrough via fuse_perform_write().
+ *
+ * With writeback caching the request size seen by the server depends on
+ * how many contiguous dirty pages the flusher finds, which is bounded by
+ * dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
+ * to a single page, turning large writes into page-sized requests.
+ *
+ * Writes already matching the server's alignment gain nothing from the
+ * page cache; fuse_perform_write() packs them up to max_write and, as they
+ * create no dirty pages, no DLM write lock needs to be cached.  Unaligned
+ * writes keep using the writeback cache, where they can merge with others.
+ */
+static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
+				     struct iov_iter *from)
+{
+	size_t count = iov_iter_count(from);
+	u64 align;
+	bool ret;
+
+	if (!fc->big_writes)
+		return true;
+
+	/* direct, append and nowait writes stay on their current code paths */
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
+		return true;
+
+	align = fc->alignment_pages ?
+		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
+
+	ret = !IS_ALIGNED(iocb->ki_pos, align) || !IS_ALIGNED((u64)count, align);
+	return ret;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1486,6 +1522,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			goto writethrough;
 		}
 
+		if (!fuse_use_writeback_cache(fc, iocb, from))
+			goto writethrough;
+
 		/* if we have dlm support acquire the lock for the area
 		 * we are writing into */
 		if (fc->dlm) {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d472df6370a400..e2b2c68b162d43 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -429,8 +429,8 @@ struct fuse_file_lock {
  * FUSE_OVER_IO_URING: Indicate that client supports io-uring
  * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
  * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
- * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
- *			optimal io-size alignment
+ * FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
+ *			in bytes) for optimal io-size alignment
  * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free
  *			 to register between 1 and nr-core io-uring queues
  */
@@ -926,7 +926,8 @@ struct fuse_init_in {
 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24
 
 /*
- * align_page_order: Number of pages for optimal IO, or a multiple of that
+ * align_page_order: log2 of the optimal IO size in bytes; IO is optimal
+ * when sized and aligned to (1 << align_page_order) or a multiple of it
  */
 struct fuse_init_out {
 	uint32_t	major;
From c1b073fb0159212a8742b680b983ae7a9d875f8d Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 15 Jun 2026 12:16:36 +0200
Subject: [PATCH 3/3] fuse: add writethrough_threshold knob to bypass the
 writeback cache

Add a per-connection size threshold, settable via fusectl as
writethrough_threshold, that sends buffered writes >= threshold through
fuse_perform_write() regardless of alignment. The knob is off by default
(0 == disabled) and leaves the existing alignment-based decision in place
for writes below the threshold.

The value is given in bytes and may carry a K, M or G suffix; input that
does not fit into an unsigned int is rejected with -EINVAL.

Signed-off-by: Horst Birthelmer
---
 fs/fuse/control.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/file.c    |   5 +++
 fs/fuse/fuse_i.h  |   5 ++-
 3 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 284a3500646296..39ef00e127b37a 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -184,6 +184,112 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 	return ret;
 }
 
+/*
+ * Report the connection's writethrough_threshold.  Returns 0 (EOF) when no
+ * connection can be obtained for the control file.
+ */
+static ssize_t fuse_conn_writethrough_threshold_read(struct file *file,
+						     char __user *buf,
+						     size_t len, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = READ_ONCE(fc->writethrough_threshold);
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+/*
+ * Set the connection's writethrough_threshold, in bytes (0 turns it off).
+ * The input is a number in any base kstrtoull() detects, optionally
+ * followed by a K, M or G suffix, and must fit into an unsigned int.
+ *
+ * Returns @count on success, -EINVAL for a non-zero offset or for input
+ * that is malformed or out of range, -EFAULT if the user buffer cannot be
+ * read and -ENOENT if no connection can be obtained for the control file.
+ */
+static ssize_t fuse_conn_writethrough_threshold_write(struct file *file,
+						      const char __user *buf,
+						      size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	char kbuf[32];
+	unsigned long long val;
+	unsigned int shift = 0;
+	size_t len;
+	char *s;
+
+	if (*ppos)
+		return -EINVAL;
+	if (count == 0 || count >= sizeof(kbuf))
+		return -EINVAL;
+	if (copy_from_user(kbuf, buf, count))
+		return -EFAULT;
+	kbuf[count] = '\0';
+
+	/* no sign, no leading blank and no bare suffix: require a digit */
+	if (kbuf[0] < '0' || kbuf[0] > '9')
+		return -EINVAL;
+
+	/* drop the trailing newline/blanks left behind by e.g. echo */
+	s = strim(kbuf);
+	len = strlen(s);
+
+	/*
+	 * Handle the size suffix by hand rather than with memparse(), which
+	 * reports neither overflowing digits nor an overflowing suffix shift:
+	 * "16E" would wrap to 0 and silently disable the threshold.  The
+	 * value has to fit into 32 bits, so only K, M and G are meaningful.
+	 */
+	switch (s[len - 1]) {
+	case 'G':
+	case 'g':
+		shift = 30;
+		break;
+	case 'M':
+	case 'm':
+		shift = 20;
+		break;
+	case 'K':
+	case 'k':
+		shift = 10;
+		break;
+	default:
+		break;
+	}
+	if (shift)
+		s[len - 1] = '\0';
+
+	/* kstrtoull() fails on trailing garbage and on 64-bit overflow */
+	if (kstrtoull(s, 0, &val))
+		return -EINVAL;
+	if (val > (UINT_MAX >> shift))
+		return -EINVAL;
+	val <<= shift;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return -ENOENT;
+
+	WRITE_ONCE(fc->writethrough_threshold, (unsigned int)val);
+	fuse_conn_put(fc);
+
+	return count;
+}
+
+static const struct file_operations fuse_conn_writethrough_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_writethrough_threshold_read,
+	.write = fuse_conn_writethrough_threshold_write,
+	.llseek = no_llseek,
+};
+
 static const struct file_operations fuse_ctl_abort_ops = {
 	.open = nonseekable_open,
 	.write = fuse_conn_abort_write,
@@ -278,7 +384,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 1, NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, 1, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_writethrough_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3423db2ffe7e76..b782778eecccd4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1483,6 +1483,7 @@ static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
 				     struct iov_iter *from)
 {
 	size_t count = iov_iter_count(from);
+	unsigned int wt;
 	u64 align;
 	bool ret;
 
@@ -1493,6 +1494,10 @@ static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
 	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
 		return true;
 
+	wt = READ_ONCE(fc->writethrough_threshold);
+	if (wt && count >= wt)
+		return false;
+
 	align = fc->alignment_pages ?
 		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 72aabf8dd5ff31..146ebf7243be23 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,7 +47,7 @@
 #define FUSE_NAME_MAX (PATH_MAX - 1)
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 6
 
 /** Maximum of max_pages received in init_out */
 extern unsigned int fuse_max_pages_limit;
@@ -952,6 +952,9 @@ struct fuse_conn {
 	/* The foffset alignment in PAGE */
 	unsigned int alignment_pages;
 
+	/* Buffered writes >= this size bypass the writeback cache (0 = off) */
+	unsigned int writethrough_threshold;
+
 };
 
 /*