diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 55ab5e9cf61715..3d60770a1cc3bc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1464,6 +1464,50 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
 	}
 }
 
+
+/*
+ * With writeback caching the request size seen by the server depends on
+ * how many contiguous dirty pages the flusher finds, which is bounded by
+ * dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
+ * to a single page under streaming writes, turning large application
+ * writes into page-sized requests.
+ *
+ * Writes that already match the server's preferred alignment gain
+ * nothing from accumulating in the page cache, so send them through
+ * fuse_perform_write() instead, which packs requests up to max_write.
+ * They create no dirty pages, hence no DLM write lock needs to be cached
+ * for them.  Unaligned writes keep using the writeback cache, where they
+ * can merge with neighbouring data.
+ */
+static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
+				     struct iov_iter *from)
+{
+	size_t count = iov_iter_count(from);
+	u64 align;
+	bool ret;
+
+	if (!fc->big_writes) {
+		pr_debug("%s: wbc=1 no big_writes pos=%lld count=%zu\n",
+			 __func__, iocb->ki_pos, count);
+		return true;
+	}
+
+	/* direct, append and nowait writes rely on their current paths */
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT)) {
+		pr_debug("%s: wbc=1 ki_flags=0x%x pos=%lld count=%zu\n",
+			 __func__, iocb->ki_flags, iocb->ki_pos, count);
+		return true;
+	}
+
+	align = fc->alignment_pages ?
+		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
+
+	ret = !IS_ALIGNED(iocb->ki_pos | (u64)count, align);
+	pr_debug("%s: wbc=%d pos=%lld count=%zu align=%llu alignment_pages=%u\n",
+		 __func__, ret, iocb->ki_pos, count, align, fc->alignment_pages);
+	return ret;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1486,6 +1530,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			goto writethrough;
 		}
 
+		if (!fuse_use_writeback_cache(fc, iocb, from))
+			goto writethrough;
+
 		/* if we have dlm support acquire the lock for the area
 		 * we are writing into */
 		if (fc->dlm) {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a2d496160da4c5..259daca77475fe 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1745,7 +1745,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
 	/* fuse does it's own writeback accounting */
 	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
-	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
 	 * For a single fuse filesystem use max 1% of dirty +
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d472df6370a400..e2b2c68b162d43 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -429,8 +429,8 @@ struct fuse_file_lock {
  * FUSE_OVER_IO_URING: Indicate that client supports io-uring
  * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
  * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
- * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
- *			optimal io-size alignment
+ * FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
+ *			in bytes) for optimal io-size alignment
  * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free
  *			 to register between 1 and nr-core io-uring queues
  */
@@ -926,7 +926,8 @@ struct fuse_init_in {
 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24
 
 /*
- * align_page_order: Number of pages for optimal IO, or a multiple of that
+ * align_page_order: log2 of the optimal IO size in bytes; IO is optimal
+ * when sized and aligned to (1 << align_page_order) or a multiple of it
  */
 struct fuse_init_out {
 	uint32_t	major;