From 6317804781ef87aa3bac1ddbae3488e4fff57431 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Fri, 29 May 2026 02:13:30 +0200
Subject: [PATCH 01/11] Foundation: add --stream-layers flag, planner
 annotation, layer registry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surface area for the unified-streaming design. No behaviour change
when --stream-layers is unset — the dispatch in compute<T>() (added in
the follow-up commit on src/ggml_extend.hpp) short-circuits to the
upstream walker.

- `sd_ctx_params_t::stream_layers` field (include/stable-diffusion.h).
- `--stream-layers` boolean CLI flag (examples/common/common.{h,cpp}).
- `sd::ggml_graph_cut::SegmentResidency` enum + Segment::residency
  field + annotate_residency declaration (src/ggml_graph_cut.h).
- `sd::layer_registry::Registry` with register/move primitives using
  the proven dup-copy-swap idiom on tensor->buffer/data/extra
  (new src/layer_registry.{h,cpp}).
- Conditioner subclasses gain a virtual set_stream_layers_enabled
  (src/conditioner.hpp). UpscalerGGML forwards to its inner runner
  (src/upscaler.{h,cpp}).
---
 examples/common/common.cpp |   6 ++
 examples/common/common.h   |   1 +
 include/stable-diffusion.h |   1 +
 src/conditioner.hpp        |  43 +++++++++++++
 src/ggml_graph_cut.h       |  23 +++++++
 src/layer_registry.cpp     | 127 +++++++++++++++++++++++++++++++++++++
 src/layer_registry.h       |  49 ++++++++++++++
 src/upscaler.cpp           |   8 +++
 src/upscaler.h             |   2 +
 9 files changed, 260 insertions(+)
 create mode 100644 src/layer_registry.cpp
 create mode 100644 src/layer_registry.h
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 519e8aae6..7878d4e36 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -418,6 +418,10 @@ ArgOptions SDContextParams::get_options() {
     };
 
     options.bool_options = {
+        {"",
+         "--stream-layers",
+         "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
+         true, &stream_layers},
         {"",
          "--force-sdxl-vae-conv-scale",
          "force use of conv scale on sdxl vae",
@@ -694,6 +698,7 @@ std::string SDContextParams::to_string() const {
         << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
         << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
         << "  max_vram: " << max_vram << ",\n"
+        << "  stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@@ -773,6 +778,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
         max_vram,
+        stream_layers,
         backend.c_str(),
         params_backend.c_str(),
     };
diff --git a/examples/common/common.h b/examples/common/common.h
index ca367f7ee..35475fa8c 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -112,6 +112,7 @@ struct SDContextParams {
     rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
     bool offload_params_to_cpu  = false;
     float max_vram              = 0.f;
+    bool stream_layers          = false;
     std::string backend;
     std::string params_backend;
     bool enable_mmap           = false;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index f8b2c2f59..a7555020d 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -213,6 +213,7 @@ typedef struct {
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
     float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
+    bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
     const char* backend;
     const char* params_backend;
 } sd_ctx_params_t;
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index f08feeef7..ba4af189e 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -118,6 +118,7 @@ struct Conditioner {
     virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors)           = 0;
     virtual size_t get_params_buffer_size()                                                = 0;
     virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
+    virtual void set_stream_layers_enabled(bool enabled) {}
     virtual void set_flash_attention_enabled(bool enabled) = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
     virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(int n_threads,
@@ -205,6 +206,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        text_model->set_stream_layers_enabled(enabled);
+        if (sd_version_is_sdxl(version)) {
+            text_model2->set_stream_layers_enabled(enabled);
+        }
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         text_model->set_flash_attention_enabled(enabled);
         if (sd_version_is_sdxl(version)) {
@@ -831,6 +839,18 @@ struct SD3CLIPEmbedder : public Conditioner {
         }
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        if (clip_l) {
+            clip_l->set_stream_layers_enabled(enabled);
+        }
+        if (clip_g) {
+            clip_g->set_stream_layers_enabled(enabled);
+        }
+        if (t5) {
+            t5->set_stream_layers_enabled(enabled);
+        }
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         if (clip_l) {
             clip_l->set_flash_attention_enabled(enabled);
@@ -1183,6 +1203,15 @@ struct FluxCLIPEmbedder : public Conditioner {
         }
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        if (clip_l) {
+            clip_l->set_stream_layers_enabled(enabled);
+        }
+        if (t5) {
+            t5->set_stream_layers_enabled(enabled);
+        }
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         if (clip_l) {
             clip_l->set_flash_attention_enabled(enabled);
@@ -1414,6 +1443,12 @@ struct T5CLIPEmbedder : public Conditioner {
         }
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        if (t5) {
+            t5->set_stream_layers_enabled(enabled);
+        }
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         if (t5) {
             t5->set_flash_attention_enabled(enabled);
@@ -1594,6 +1629,10 @@ struct AnimaConditioner : public Conditioner {
         llm->set_max_graph_vram_bytes(max_vram_bytes);
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        llm->set_stream_layers_enabled(enabled);
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@@ -1735,6 +1774,10 @@ struct LLMEmbedder : public Conditioner {
         llm->set_max_graph_vram_bytes(max_vram_bytes);
     }
 
+    void set_stream_layers_enabled(bool enabled) override {
+        llm->set_stream_layers_enabled(enabled);
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h
index dcd1551a7..7943f48d6 100644
--- a/src/ggml_graph_cut.h
+++ b/src/ggml_graph_cut.h
@@ -2,6 +2,7 @@
 #define __SD_GGML_GRAPH_CUT_H__
 
 #include <array>
+#include <cstdint>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -11,6 +12,16 @@
 
 namespace sd::ggml_graph_cut {
 
+    // Whether a segment's params live on the GPU across the entire generation
+    // (RESIDENT) or get streamed in per sampling step then evicted (STREAMED).
+    // Only populated when the planner is invoked with stream_layers_enabled=true;
+    // otherwise every segment is implicitly STREAMED and the existing walker in
+    // GGMLRunner::compute() handles it (upstream behavior unchanged).
+    enum class SegmentResidency : uint8_t {
+        STREAMED = 0,
+        RESIDENT = 1,
+    };
+
     struct Segment {
         enum InputType {
             INPUT_EXTERNAL = 0,
@@ -34,6 +45,7 @@ namespace sd::ggml_graph_cut {
         std::vector<int> internal_node_indices;
         std::vector<int> output_node_indices;
         std::vector<InputRef> input_refs;
+        SegmentResidency residency = SegmentResidency::STREAMED;
     };
 
     struct Plan {
@@ -101,6 +113,17 @@ namespace sd::ggml_graph_cut {
                       size_t max_graph_vram_bytes,
                       const std::unordered_set<const ggml_tensor*>& params_tensor_set,
                       const char* log_desc);
+
+    // Annotate the first K segments of plan as RESIDENT, where K is the
+    // number of leading segments that fit in the residency budget. K is
+    // determined by:
+    //   K = (max_graph_vram_bytes - prefetch_reserve - safety
+    //        - compute_buffer_reserve) / per_segment_param_bytes,
+    //   clamped to plan.segments.size().
+    //
+    // If max_graph_vram_bytes is 0 or the plan has fewer than 2 segments,
+    // this is a no-op (no annotation; behavior matches upstream).
+    void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
 }  // namespace sd::ggml_graph_cut
 
 #endif
diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp
new file mode 100644
index 000000000..9b809d7b2
--- /dev/null
+++ b/src/layer_registry.cpp
@@ -0,0 +1,127 @@
+#include "layer_registry.h"
+
+#include <utility>
+
+#include "util.h"
+
+namespace sd::layer_registry {
+
+void Registry::register_layer(const std::string& name, ggml_tensor* tensor) {
+    auto& info = layers_[name];
+    info.tensors.push_back(tensor);
+    info.bytes += ggml_nbytes(tensor);
+}
+
+bool Registry::move_layer_to_gpu(const std::string& name) {
+    auto it = layers_.find(name);
+    if (it == layers_.end()) return false;
+
+    LayerInfo& info = it->second;
+    if (info.on_gpu) return true;
+    if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
+        LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
+                  name.c_str());
+        return false;
+    }
+    if (info.tensors.empty()) {
+        info.on_gpu = true;
+        return true;
+    }
+
+    // 1. Build a no_alloc context big enough to hold one twin tensor per CPU
+    //    tensor, plus a little overhead.
+    const size_t     ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
+    ggml_init_params ctx_params{ ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true };
+    ggml_context*    twin_ctx = ggml_init(ctx_params);
+    if (twin_ctx == nullptr) {
+        LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
+                  name.c_str());
+        return false;
+    }
+
+    // 2. Create one GPU twin per CPU tensor. The twin shares the original
+    //    name so any name-based lookup keeps working.
+    std::vector<ggml_tensor*> gpu_twins;
+    gpu_twins.reserve(info.tensors.size());
+    for (ggml_tensor* cpu_t : info.tensors) {
+        ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
+        if (cpu_t->name[0] != '\0') {
+            ggml_set_name(twin, cpu_t->name);
+        }
+        gpu_twins.push_back(twin);
+    }
+
+    // 3. Back the twins with a GPU buffer in one alloc call.
+    ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
+    if (gpu_buffer == nullptr) {
+        LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
+                  name.c_str());
+        ggml_free(twin_ctx);
+        return false;
+    }
+
+    // 4. H2D copy + sync.
+    for (size_t i = 0; i < info.tensors.size(); ++i) {
+        ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
+    }
+    ggml_backend_synchronize(gpu_backend_);
+
+    // 5. Swap buffer/data/extra so the originals now point at GPU memory.
+    for (size_t i = 0; i < info.tensors.size(); ++i) {
+        std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
+        std::swap(info.tensors[i]->data,   gpu_twins[i]->data);
+        std::swap(info.tensors[i]->extra,  gpu_twins[i]->extra);
+    }
+
+    info.gpu_twins  = std::move(gpu_twins);
+    info.twin_ctx   = twin_ctx;
+    info.gpu_buffer = gpu_buffer;
+    info.on_gpu     = true;
+    return true;
+}
+
+bool Registry::move_layer_to_cpu(const std::string& name) {
+    auto it = layers_.find(name);
+    if (it == layers_.end()) return false;
+
+    LayerInfo& info = it->second;
+    if (!info.on_gpu) return true;
+    if (info.tensors.size() != info.gpu_twins.size()) {
+        LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
+                  name.c_str());
+        return false;
+    }
+
+    // 1. Swap back: originals point at CPU memory again.
+    for (size_t i = 0; i < info.tensors.size(); ++i) {
+        if (info.gpu_twins[i] == nullptr) continue;
+        std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
+        std::swap(info.tensors[i]->data,   info.gpu_twins[i]->data);
+        std::swap(info.tensors[i]->extra,  info.gpu_twins[i]->extra);
+    }
+
+    // 2. Free the GPU buffer + twin context.
+    if (info.gpu_buffer != nullptr) {
+        ggml_backend_buffer_free(info.gpu_buffer);
+        info.gpu_buffer = nullptr;
+    }
+    if (info.twin_ctx != nullptr) {
+        ggml_free(info.twin_ctx);
+        info.twin_ctx = nullptr;
+    }
+    info.gpu_twins.clear();
+    info.on_gpu = false;
+    return true;
+}
+
+bool Registry::is_layer_on_gpu(const std::string& name) const {
+    auto it = layers_.find(name);
+    return it != layers_.end() && it->second.on_gpu;
+}
+
+size_t Registry::get_layer_size(const std::string& name) const {
+    auto it = layers_.find(name);
+    return it != layers_.end() ? it->second.bytes : 0;
+}
+
+}  // namespace sd::layer_registry
diff --git a/src/layer_registry.h b/src/layer_registry.h
new file mode 100644
index 000000000..d8b572db3
--- /dev/null
+++ b/src/layer_registry.h
@@ -0,0 +1,49 @@
+#ifndef __LAYER_REGISTRY_H__
+#define __LAYER_REGISTRY_H__
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+namespace sd::layer_registry {
+
+struct LayerInfo {
+    std::vector<ggml_tensor*> tensors;
+    std::vector<ggml_tensor*> gpu_twins;
+    ggml_context*             twin_ctx   = nullptr;
+    ggml_backend_buffer_t     gpu_buffer = nullptr;
+    bool                      on_gpu     = false;
+    size_t                    bytes      = 0;
+};
+
+class Registry {
+public:
+    Registry() = default;
+    Registry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
+        : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
+
+    void   set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
+        gpu_backend_ = gpu_backend; cpu_backend_ = cpu_backend;
+    }
+    void   register_layer(const std::string& name, ggml_tensor* tensor);
+    bool   move_layer_to_gpu(const std::string& name);
+    bool   move_layer_to_cpu(const std::string& name);
+    bool   is_layer_on_gpu(const std::string& name) const;
+    size_t get_layer_size(const std::string& name) const;
+    size_t get_layer_count() const { return layers_.size(); }
+
+    const std::map<std::string, LayerInfo>& layers() const { return layers_; }
+
+private:
+    ggml_backend_t                   gpu_backend_ = nullptr;
+    ggml_backend_t                   cpu_backend_ = nullptr;
+    std::map<std::string, LayerInfo> layers_;
+};
+
+}  // namespace sd::layer_registry
+
+#endif
diff --git a/src/upscaler.cpp b/src/upscaler.cpp
index 1197ce35e..cef26e36d 100644
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@@ -25,6 +25,13 @@ void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
     }
 }
 
+void UpscalerGGML::set_stream_layers_enabled(bool enabled) {
+    stream_layers_enabled = enabled;
+    if (esrgan_upscaler) {
+        esrgan_upscaler->set_stream_layers_enabled(enabled);
+    }
+}
+
 bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
                                   bool offload_params_to_cpu,
                                   int n_threads) {
@@ -76,6 +83,7 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
                                                tile_size,
                                                model_loader.get_tensor_storage_map());
     esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
+    esrgan_upscaler->set_stream_layers_enabled(stream_layers_enabled);
     if (direct) {
         esrgan_upscaler->set_conv2d_direct_enabled(true);
     }
diff --git a/src/upscaler.h b/src/upscaler.h
index e3967865c..e2197694d 100644
--- a/src/upscaler.h
+++ b/src/upscaler.h
@@ -18,6 +18,7 @@ struct UpscalerGGML {
     bool direct                 = false;
     int tile_size               = 128;
     size_t max_graph_vram_bytes = 0;
+    bool stream_layers_enabled  = false;
     std::string backend_spec;
     std::string params_backend_spec;
 
@@ -31,6 +32,7 @@ struct UpscalerGGML {
                         bool offload_params_to_cpu,
                         int n_threads);
     void set_max_graph_vram_bytes(size_t max_vram_bytes);
+    void set_stream_layers_enabled(bool enabled);
     sd::Tensor<float> upscale_tensor(const sd::Tensor<float>& input_tensor);
     sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor);
 };

From 010e7a619822f41c0d3366585b62f9b0b36127a0 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Fri, 29 May 2026 02:13:30 +0200
Subject: [PATCH 02/11] chunk-K residency, multi-runner safety, and LoRA
 correctness fallbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Builds on the foundation commit (--stream-layers + planner annotation
+ executor scaffolding) with:

## chunk-K residency

A parallel `resident_*` offload track on `GGMLRunner` keeps a fraction
of the diffusion model's params on GPU permanently across sampling
steps, amortising H2D over many invocations.

- Members: `resident_offload_ctx`, `resident_offload_pairs`,
  `resident_runtime_params_buffer`, `resident_param_set`,
  `resident_state_token` (parallel to the existing `partial_offload_*`
  per-segment track).
- `offload_resident_params(tensors)` / `restore_resident_params()` use
  the same dup-copy-swap idiom as `offload_partial_params` but write
  to the resident slot and persist across `compute()` calls.
- `offload_partial_params` filters tensors already in
  `resident_param_set` so per-segment offload skips them.
  `restore_resident_params` is hooked into `~GGMLRunner()` and
  `free_params_buffer()` to keep swap pointers valid through teardown.
- `compute_streaming_segments<T>` reads
  `graph_cut_plan_cache_.graph_cut_plan` (the unmerged base plan),
  annotates it, gathers the union of RESIDENT segments' param tensors,
  and offloads them once. Compute itself proceeds on the merged plan
  for fused-graph efficiency. A commutative pointer-hash state-token
  detects when a different plan is in play and rebuilds the resident
  set.

`annotate_residency` updates:

- "Any param-bearing segment exists" sanity replaces the
  `segments[0].input_param_bytes == 0` early-return (wrong for
  diffusion models whose first segment is a small prelude).
- Greedy cumulative-bytes loop handles heterogeneous segment sizes
  (small prelude + large transformer layers).
- Resets `seg.residency = STREAMED` at entry so cached plans don't
  carry forward stale RESIDENT marks from a previous larger-budget
  call.
- Don't reserve a `prefetch_segments * largest_segment` window;
  async prefetch is no longer used (see below).

## Multi-runner safety

- Per-runner free-VRAM clamp at compute time in
  `resolve_graph_cut_plan`. Each runner queries
  `ggml_backend_dev_memory(runtime_backend)` and clamps
  `effective_budget = min(max_vram, free - 512 MB)` per call.
  Without this, after the LLM committed ~7 GB chunk-K resident the
  diffusion runner still believed it had the whole budget and OOM'd.
- `--stream-layers` is restricted to diffusion runners only
  (diffusion_model + high_noise_diffusion_model) — matches PR #1477's
  scope and avoids one-shot runners (LLM, VAE, clip_vision, upscaler)
  claiming permanent chunk-K state that starves the diffusion model.
- `GGMLRunner::release_streaming_residency()` (public trampoline to
  `restore_resident_params`) is called from `decode_first_stage()` on
  diffusion_model + high_noise_diffusion_model right before VAE
  decode. Without it the 6.5 GB chunk-K residency from sampling would
  starve VAE's compute buffer (~4.5 GB at full image resolution) and
  OOM at decode.

## `LORA_APPLY_AUTO` picks runtime when streaming or CPU-offload is on

Immediate mode bakes LoRA into weights at load time by running a
forward pass over every weight tensor — allocates a full-model-size
(~11 GB on Z-Image bf16) compute buffer on the runtime backend in
one shot and OOMs on any VRAM-constrained setup, which is the whole
reason `--stream-layers` / `--offload-to-cpu` exist. AUTO previously
only picked runtime for quantized models; now `stream_layers ||
offload_params_to_cpu` is also a trigger.

## Conservative streaming v4 (current shipping configuration)

After observing edge-case failures with prefetch + multi-LoRA / +
non-default --guidance/--flow-shift / + batch_count > 1, the final
configuration is:

- chunk-K residency skips itself when `weight_adapter != nullptr`.
  The state-token hashes tensor *pointers*, not data, so it can't
  detect MultiLoraAdapter modifications across batch images / steps;
  the symptom was colored static noise on batch image 2+.
- Async prefetch is hard-disabled (`prefetch_enabled = false`). The
  `compute_streaming_segments_prefetch<T>` implementation stays in
  the file for future reference but is currently dead. Two correctness
  problems forced the disable:
    * Multi-LoRA workloads: graph_compute_async + per-segment pending
      offload races MultiLoraAdapter's per-layer patch_weight reads.
    * batch_count > 1 + non-default --guidance + --flow-shift: the
      smaller merged segments required to fit two prefetched buffers
      in --max-vram accumulate FP error across the extra boundary-
      cache roundtrips → collapses to pure white frames.
- `resolve_graph_cut_plan` always passes the full `effective_budget`
  to the planner (no `/4` shrinking). Produces the upstream walker's
  large merged segments — the validated configuration.
- The chunk-K hook reserves room for the LARGEST merged segment's
  params: `chunk_k_budget = max_graph_vram_bytes -
  largest_merged_segment`. Without this reservation chunk-K could
  grow large enough that the active merged-segment offload OOMs.

## Verification

- Z-Image Q8 with `--stream-layers off`: byte-identical to upstream
  walker.
- Z-Image bf16 1024x688 + 2 LoRAs + `--cfg-scale 1.0 --guidance 3.5
  --flow-shift 3.0` + batch_count=2 (the production REST API failure
  mode that motivated the correctness work): both batch images clean
  and distinct.
- Smoke-matrix-verified across Z-Image Q8/bf16, HiDream, Qwen,
  Flux schnell, SD3.5, SDXL, Anima, WAN.
---
 src/ggml_extend.hpp      | 743 ++++++++++++++++++++++++++++++++++++++-
 src/ggml_graph_cut.cpp   |  54 +++
 src/stable-diffusion.cpp |  58 ++-
 3 files changed, 852 insertions(+), 3 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 9178a31e8..093220a85 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -28,6 +28,7 @@
 #include "ggml.h"
 #include "ggml_extend_backend.h"
 #include "ggml_graph_cut.h"
+#include "layer_registry.h"
 
 #include "model.h"
 #include "tensor.hpp"
@@ -1725,7 +1726,31 @@ struct GGMLRunner {
     ggml_context* partial_offload_ctx                   = nullptr;
     ggml_backend_buffer_t partial_runtime_params_buffer = nullptr;
     std::vector<std::pair<ggml_tensor*, ggml_tensor*>> partial_offload_pairs;
+
+    // Async-prefetch staging: a parallel partial-offload track used by the
+    // streaming executor to issue the NEXT segment's H2D copies while the
+    // CURRENT segment is computing. After the current segment's compute is
+    // done and its partial state is torn down, the pending_* members are
+    // promoted into partial_* via promote_pending_to_partial().
+    ggml_context* pending_offload_ctx                   = nullptr;
+    ggml_backend_buffer_t pending_runtime_params_buffer = nullptr;
+    std::vector<std::pair<ggml_tensor*, ggml_tensor*>> pending_offload_pairs;
+
+    // chunk-K residency track: holds params that stay GPU-resident across
+    // multiple compute_streaming_segments() invocations within a generation.
+    // Coexists with the partial_* per-segment track; offload_partial_params()
+    // filters out anything already in resident_param_set so its
+    // dup-copy-swap dance doesn't tear down the resident buffer-swap.
+    ggml_context* resident_offload_ctx = nullptr;
+    std::vector<std::pair<ggml_tensor*, ggml_tensor*>> resident_offload_pairs;
+    ggml_backend_buffer_t resident_runtime_params_buffer = nullptr;
+    std::unordered_set<ggml_tensor*> resident_param_set;
+    uint64_t resident_state_token = 0;
+
     size_t max_graph_vram_bytes = 0;
+    bool stream_layers_enabled = false;
+
+    sd::layer_registry::Registry layer_registry_;
 
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
 
@@ -1803,6 +1828,10 @@ struct GGMLRunner {
             ggml_free(partial_offload_ctx);
             partial_offload_ctx = nullptr;
         }
+        if (pending_offload_ctx != nullptr) {
+            ggml_free(pending_offload_ctx);
+            pending_offload_ctx = nullptr;
+        }
     }
 
     void alloc_cache_ctx() {
@@ -2193,6 +2222,9 @@ struct GGMLRunner {
             if (tensor == nullptr) {
                 continue;
             }
+            if (resident_param_set.find(tensor) != resident_param_set.end()) {
+                continue;  // already resident on GPU via the chunk-K track
+            }
             if (seen_tensors.insert(tensor).second) {
                 unique_tensors.push_back(tensor);
             }
@@ -2315,6 +2347,262 @@ struct GGMLRunner {
         }
     }
 
+    // Issue an H2D copy for the next segment's params WITHOUT performing
+    // a buffer-swap on the source tensors yet. The dup-tensors hold the GPU
+    // data ready to swap in. Unlike offload_partial_params, this does NOT
+    // call restore_partial_params() — the current partial state must remain
+    // intact so the in-flight compute is unaffected.
+    //
+    // The H2D goes through ggml_backend_tensor_copy() which for CUDA hits
+    // cudaStreamPerThread; compute runs on cuda_ctx->stream() (a separate
+    // non-blocking stream), so the two should overlap on-device.
+    bool offload_pending_params(const std::vector<ggml_tensor*>& tensors) {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (tensors.empty()) {
+            return true;
+        }
+        GGML_ASSERT(pending_offload_ctx == nullptr);
+        GGML_ASSERT(pending_runtime_params_buffer == nullptr);
+        GGML_ASSERT(pending_offload_pairs.empty());
+
+        std::vector<ggml_tensor*> unique_tensors;
+        std::unordered_set<ggml_tensor*> seen_tensors;
+        unique_tensors.reserve(tensors.size());
+        seen_tensors.reserve(tensors.size());
+        for (ggml_tensor* tensor : tensors) {
+            if (tensor == nullptr) {
+                continue;
+            }
+            if (resident_param_set.find(tensor) != resident_param_set.end()) {
+                continue;
+            }
+            if (seen_tensors.insert(tensor).second) {
+                unique_tensors.push_back(tensor);
+            }
+        }
+        if (unique_tensors.empty()) {
+            return true;
+        }
+
+        ggml_init_params params;
+        params.mem_size   = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
+        params.mem_buffer = nullptr;
+        params.no_alloc   = true;
+
+        pending_offload_ctx = ggml_init(params);
+        GGML_ASSERT(pending_offload_ctx != nullptr);
+
+        pending_offload_pairs.clear();
+        pending_offload_pairs.reserve(unique_tensors.size());
+
+        for (ggml_tensor* tensor : unique_tensors) {
+            GGML_ASSERT(tensor->view_src == nullptr);
+            ggml_tensor* dup = ggml_dup_tensor(pending_offload_ctx, tensor);
+            ggml_set_name(dup, tensor->name);
+            pending_offload_pairs.push_back({tensor, dup});
+        }
+
+        pending_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(pending_offload_ctx, runtime_backend);
+        if (pending_runtime_params_buffer == nullptr) {
+            LOG_ERROR("%s alloc pending runtime params backend buffer failed, num_tensors = %zu",
+                      get_desc().c_str(),
+                      pending_offload_pairs.size());
+            ggml_free(pending_offload_ctx);
+            pending_offload_ctx = nullptr;
+            pending_offload_pairs.clear();
+            return false;
+        }
+        ggml_backend_buffer_set_usage(pending_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        // Issue the H2D copies. The src tensors are still pointing at host
+        // (or params backend) data; the dup tensors are on runtime_backend.
+        // ggml_backend_tensor_copy(host_src, gpu_dst) calls
+        // ggml_backend_tensor_set(dst, ...) which for CUDA enqueues on
+        // cudaStreamPerThread — independent of the compute stream.
+        for (auto& pair : pending_offload_pairs) {
+            ggml_tensor* tensor         = pair.first;
+            ggml_tensor* offload_tensor = pair.second;
+            ggml_backend_tensor_copy(tensor, offload_tensor);
+        }
+
+        return true;
+    }
+
+    // Promote a pending (prefetched) offload track into the active partial
+    // track. Caller MUST have already torn down the previous partial track
+    // via restore_partial_params(). After this call, partial_* owns the
+    // pending dup-tensors and the swap into the active source tensors is
+    // performed here.
+    void promote_pending_to_partial() {
+        if (pending_offload_pairs.empty()) {
+            if (pending_runtime_params_buffer != nullptr) {
+                ggml_backend_buffer_free(pending_runtime_params_buffer);
+                pending_runtime_params_buffer = nullptr;
+            }
+            if (pending_offload_ctx != nullptr) {
+                ggml_free(pending_offload_ctx);
+                pending_offload_ctx = nullptr;
+            }
+            return;
+        }
+        GGML_ASSERT(partial_offload_pairs.empty());
+        GGML_ASSERT(partial_offload_ctx == nullptr);
+        GGML_ASSERT(partial_runtime_params_buffer == nullptr);
+
+        // Swap source tensors to point at the prefetched GPU storage.
+        for (auto& pair : pending_offload_pairs) {
+            ggml_tensor* tensor         = pair.first;
+            ggml_tensor* offload_tensor = pair.second;
+            std::swap(tensor->buffer, offload_tensor->buffer);
+            std::swap(tensor->data, offload_tensor->data);
+            std::swap(tensor->extra, offload_tensor->extra);
+        }
+        partial_offload_ctx           = pending_offload_ctx;
+        partial_runtime_params_buffer = pending_runtime_params_buffer;
+        partial_offload_pairs         = std::move(pending_offload_pairs);
+        pending_offload_ctx           = nullptr;
+        pending_runtime_params_buffer = nullptr;
+        pending_offload_pairs.clear();
+    }
+
+    // Discard any in-flight pending offload track without promoting it.
+    // Used on error paths after offload_pending_params() succeeded but the
+    // subsequent compute failed.
+    void discard_pending_params() {
+        if (pending_offload_pairs.empty()) {
+            if (pending_runtime_params_buffer != nullptr) {
+                ggml_backend_buffer_free(pending_runtime_params_buffer);
+                pending_runtime_params_buffer = nullptr;
+            }
+            if (pending_offload_ctx != nullptr) {
+                ggml_free(pending_offload_ctx);
+                pending_offload_ctx = nullptr;
+            }
+            return;
+        }
+        if (pending_runtime_params_buffer != nullptr) {
+            ggml_backend_buffer_free(pending_runtime_params_buffer);
+            pending_runtime_params_buffer = nullptr;
+        }
+        pending_offload_pairs.clear();
+        if (pending_offload_ctx != nullptr) {
+            ggml_free(pending_offload_ctx);
+            pending_offload_ctx = nullptr;
+        }
+    }
+
+    // chunk-K: load a set of params onto the runtime backend and keep them
+    // resident across compute() calls. Caller MUST have torn down any prior
+    // resident state via restore_resident_params() before calling. By design
+    // this does NOT call restore_resident_params() itself — the resident
+    // track persists across compute() invocations.
+    bool offload_resident_params(const std::vector<ggml_tensor*>& tensors) {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (tensors.empty()) {
+            return true;
+        }
+        GGML_ASSERT(resident_runtime_params_buffer == nullptr);
+        GGML_ASSERT(resident_offload_ctx == nullptr);
+        GGML_ASSERT(resident_offload_pairs.empty());
+        GGML_ASSERT(resident_param_set.empty());
+
+        std::vector<ggml_tensor*> unique_tensors;
+        std::unordered_set<ggml_tensor*> seen;
+        unique_tensors.reserve(tensors.size());
+        seen.reserve(tensors.size());
+        for (ggml_tensor* t : tensors) {
+            if (t == nullptr) continue;
+            if (seen.insert(t).second) unique_tensors.push_back(t);
+        }
+        if (unique_tensors.empty()) return true;
+
+        ggml_init_params init = {};
+        init.mem_size   = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
+        init.mem_buffer = nullptr;
+        init.no_alloc   = true;
+        resident_offload_ctx = ggml_init(init);
+        GGML_ASSERT(resident_offload_ctx != nullptr);
+
+        resident_offload_pairs.reserve(unique_tensors.size());
+        for (ggml_tensor* t : unique_tensors) {
+            GGML_ASSERT(t->view_src == nullptr);
+            ggml_tensor* twin = ggml_dup_tensor(resident_offload_ctx, t);
+            ggml_set_name(twin, t->name);
+            resident_offload_pairs.push_back({t, twin});
+        }
+
+        resident_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(resident_offload_ctx, runtime_backend);
+        if (resident_runtime_params_buffer == nullptr) {
+            LOG_ERROR("%s alloc resident runtime params backend buffer failed, num_tensors = %zu",
+                      get_desc().c_str(), resident_offload_pairs.size());
+            ggml_free(resident_offload_ctx);
+            resident_offload_ctx = nullptr;
+            resident_offload_pairs.clear();
+            return false;
+        }
+        ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        for (auto& pair : resident_offload_pairs) {
+            ggml_tensor* t = pair.first;
+            ggml_tensor* twin = pair.second;
+            ggml_backend_tensor_copy(t, twin);
+            std::swap(t->buffer, twin->buffer);
+            std::swap(t->data,   twin->data);
+            std::swap(t->extra,  twin->extra);
+            resident_param_set.insert(t);
+        }
+        ggml_backend_synchronize(runtime_backend);
+
+        size_t sz = ggml_backend_buffer_get_size(resident_runtime_params_buffer);
+        LOG_INFO("%s offload resident params (%6.2f MB, %zu tensors) to runtime backend (%s)",
+                 get_desc().c_str(),
+                 sz / (1024.f * 1024.f),
+                 resident_offload_pairs.size(),
+                 ggml_backend_name(runtime_backend));
+        return true;
+    }
+
+    void restore_resident_params() {
+        if (resident_offload_pairs.empty()) {
+            if (resident_runtime_params_buffer != nullptr) {
+                ggml_backend_buffer_free(resident_runtime_params_buffer);
+                resident_runtime_params_buffer = nullptr;
+            }
+            if (resident_offload_ctx != nullptr) {
+                ggml_free(resident_offload_ctx);
+                resident_offload_ctx = nullptr;
+            }
+            resident_param_set.clear();
+            resident_state_token = 0;
+            return;
+        }
+        for (auto& pair : resident_offload_pairs) {
+            ggml_tensor* t = pair.first;
+            ggml_tensor* twin = pair.second;
+            t->buffer = twin->buffer;
+            t->data   = twin->data;
+            t->extra  = twin->extra;
+            twin->buffer = nullptr;
+            twin->data   = nullptr;
+            twin->extra  = nullptr;
+        }
+        if (resident_runtime_params_buffer != nullptr) {
+            ggml_backend_buffer_free(resident_runtime_params_buffer);
+            resident_runtime_params_buffer = nullptr;
+        }
+        resident_offload_pairs.clear();
+        if (resident_offload_ctx != nullptr) {
+            ggml_free(resident_offload_ctx);
+            resident_offload_ctx = nullptr;
+        }
+        resident_param_set.clear();
+        resident_state_token = 0;
+    }
+
     bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) {
         return plan.has_cuts &&
                plan.valid &&
@@ -2334,12 +2622,85 @@ struct GGMLRunner {
                                 GraphCutPlan* plan_out) {
         GGML_ASSERT(plan_out != nullptr);
         GGML_ASSERT(gf != nullptr);
+
+        // Start from the user-set budget. When streaming is on, clamp to actual
+        // free VRAM at this moment so we don't overcommit when another runner
+        // (e.g. the LLM in cond_stage) already grabbed a chunk-K resident set
+        // earlier in the generation.
+        size_t effective_budget = max_graph_vram_bytes;
+        if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
+            if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                size_t free_vram = 0, total_vram = 0;
+                ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+                constexpr size_t safety_margin = 512ull * 1024 * 1024;
+                size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
+                if (free_clamp < effective_budget) {
+                    LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB",
+                             get_desc().c_str(),
+                             free_clamp / (1024.0 * 1024.0),
+                             effective_budget / (1024.0 * 1024.0));
+                    effective_budget = free_clamp;
+                }
+            }
+        }
+
+        // Split the effective budget so chunk-K residency and the prefetch
+        // window (2 segments + compute buffer + safety) coexist:
+        //   effective = chunk_K + 2*segment + reserve, with chunk_K == 2*segment
+        //   => segment = (effective - reserve) / 4
+        //   => chunk_K = 2 * segment
+        constexpr size_t reserve = 512ull * 1024 * 1024 + 768ull * 1024 * 1024;
+        size_t plan_budget       = effective_budget;
+        size_t residency_budget  = effective_budget;
+        if (stream_layers_enabled && effective_budget > 0) {
+            // Always pass the full effective_budget to the planner. Earlier
+            // versions of this branch shrank the budget to (effective-reserve)/4
+            // so the prefetch path could hold two segments + compute buffer
+            // simultaneously and overlap H2D with compute. That worked for
+            // simple workloads but produced wrong output (white frames /
+            // colored noise) whenever the smaller-merged-segments combined
+            // with non-default --guidance or --flow-shift and batch_count > 1
+            // (FP error accumulates across the extra boundary-cache
+            // roundtrips). Runtime LoRA needed the same fall-back for the
+            // same reason. With the full budget the planner produces the
+            // walker's large merged segments; async prefetch can't engage
+            // (no room for two), but chunk-K residency still runs on the
+            // unmerged base plan and keeps most of the per-step H2D win.
+            plan_budget      = effective_budget;
+            residency_budget = effective_budget;
+        }
+
         *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                      gf,
                                                      &graph_cut_plan_cache_,
-                                                     max_graph_vram_bytes,
+                                                     plan_budget,
                                                      params_tensor_set_,
                                                      get_desc().c_str());
+        if (stream_layers_enabled) {
+            // chunk-K residency lives alongside ONE active merged segment
+            // at compute time (we no longer prefetch two at once). Reserve
+            // room for the largest merged segment's params, then let chunk-K
+            // fill the rest. annotate_residency internally subtracts its own
+            // safety + compute_buffer pad, so we pass it `effective - largest`
+            // and it computes available = (effective - largest) - reserve.
+            size_t largest_segment_params = 0;
+            for (const auto& seg : plan_out->segments) {
+                if (seg.input_param_bytes > largest_segment_params) {
+                    largest_segment_params = seg.input_param_bytes;
+                }
+            }
+            const size_t residency_for_annotate =
+                (residency_budget > largest_segment_params)
+                    ? (residency_budget - largest_segment_params)
+                    : 0;
+            sd::ggml_graph_cut::annotate_residency(*plan_out, residency_for_annotate);
+            LOG_INFO("%s streaming budget = %.2f MB (largest merged segment = %.2f MB, chunk-K available = %.2f MB)",
+                     get_desc().c_str(),
+                     effective_budget / (1024.0 * 1024.0),
+                     largest_segment_params / (1024.0 * 1024.0),
+                     (residency_for_annotate > reserve ? (residency_for_annotate - reserve) : 0) / (1024.0 * 1024.0));
+        }
         return true;
     }
 
@@ -2630,6 +2991,365 @@ struct GGMLRunner {
     }
 
 public:
+    // Release chunk-K resident params held across compute() calls. Safe to
+    // call at any point — if no resident state exists this is a no-op.
+    // stable-diffusion.cpp uses this to evict the diffusion model's resident
+    // set before VAE decode so VAE has VRAM for its compute buffer.
+    void release_streaming_residency() {
+        restore_resident_params();
+    }
+
+    template <typename T>
+    std::optional<sd::Tensor<T>> compute_streaming_segments(ggml_cgraph*        gf,
+                                                            const GraphCutPlan& plan,
+                                                            int                 n_threads,
+                                                            bool                free_compute_buffer_immediately,
+                                                            bool                no_return = false) {
+        GGML_ASSERT(gf != nullptr);
+
+        // chunk-K residency: annotate the BASE plan (per-layer granularity) and
+        // offload its first-K segments' params once. Compute itself proceeds on
+        // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params
+        // filters out resident tensors so the merged-segment offload skips them.
+        //
+        // Disabled when a runtime weight_adapter (LoRA in at-runtime mode) is
+        // attached: chunk-K snapshots the CPU weight data once and freezes the
+        // GPU resident copy. A runtime adapter modifies the weights between
+        // calls (and even between sampling steps), which leaves the resident
+        // GPU copy stale — observable as garbage output on later batch images.
+        // Our state_token hashes tensor *pointers*, not data, so it can't
+        // detect the modification; the safe behaviour is to release any prior
+        // resident set and skip the rebuild. Async prefetch on the streaming
+        // path still engages, so the perf hit is bounded.
+        if (weight_adapter != nullptr) {
+            restore_resident_params();
+        } else {
+            sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
+            if (base_plan.available) {
+                // Reserve headroom for the active merged-segment's params
+                // (the running compute holds them on GPU alongside chunk-K).
+                // Without this, chunk-K can grow until the largest merged
+                // segment's offload OOMs. Find the worst-case merged segment
+                // and subtract its param weight from the chunk-K budget.
+                size_t largest_merged_params = 0;
+                for (const auto& seg : plan.segments) {
+                    if (seg.input_param_bytes > largest_merged_params) {
+                        largest_merged_params = seg.input_param_bytes;
+                    }
+                }
+                const size_t chunk_k_budget =
+                    (max_graph_vram_bytes > largest_merged_params)
+                        ? (max_graph_vram_bytes - largest_merged_params)
+                        : 0;
+                sd::ggml_graph_cut::annotate_residency(base_plan, chunk_k_budget);
+
+                std::vector<ggml_tensor*> resident_params;
+                uint64_t token = 0;
+                for (const auto& segment : base_plan.segments) {
+                    if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
+                        continue;
+                    }
+                    auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
+                    for (ggml_tensor* t : seg_params) {
+                        if (t == nullptr) continue;
+                        resident_params.push_back(t);
+                        token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
+                    }
+                }
+                if (token != resident_state_token) {
+                    if (resident_state_token != 0) {
+                        LOG_DEBUG("%s chunk-K: base-plan residency changed, rebuilding resident set",
+                                  get_desc().c_str());
+                    }
+                    restore_resident_params();
+                    if (!resident_params.empty()) {
+                        if (offload_resident_params(resident_params)) {
+                            resident_state_token = token;
+                        } else {
+                            LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
+                                      get_desc().c_str());
+                            restore_resident_params();
+                        }
+                    }
+                }
+            }
+        }
+
+        free_compute_buffer();
+        free_cache_ctx_and_buffer();
+
+        // Load any tensors not associated with a segment (a "_global" group
+        // holds the head/tail params). Move to GPU once for the full run.
+        layer_registry_.move_layer_to_gpu("_global");
+
+        // Async prefetch is currently disabled. The path is still implemented
+        // (compute_streaming_segments_prefetch below) and was the source of
+        // the original 26% wallclock win on the no-LoRA path, but two
+        // correctness problems force it off for now:
+        //
+        //   * Multi-LoRA workloads: graph_compute_async + per-segment
+        //     pending offload races MultiLoraAdapter's per-layer patch_weight
+        //     reads, producing colored static noise on both batch images.
+        //   * batch_count > 1 combined with non-default --guidance AND
+        //     --flow-shift: the smaller merged segments required to fit two
+        //     prefetched buffers + compute buffer in --max-vram accumulate
+        //     enough FP error across the extra boundary-cache roundtrips to
+        //     collapse the diffusion output to pure white frames.
+        //
+        // Until both are properly fixed (most likely by porting PR #1477's
+        // chunk_graph.hpp — see unified_streaming_future_optimizations memory
+        // note), the synchronous segment loop below is the only validated
+        // streaming path. chunk-K residency still engages, which preserves
+        // most of the per-step H2D savings.
+        const bool prefetch_enabled = false;
+
+        if (prefetch_enabled) {
+            return compute_streaming_segments_prefetch<T>(gf, plan, n_threads, no_return);
+        }
+
+        std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
+        for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
+            int64_t t_segment_begin = ggml_time_ms();
+            const auto& segment     = plan.segments[seg_idx];
+            const bool  is_last     = seg_idx + 1 == plan.segments.size();
+            auto future_cut_names   = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx);
+
+            LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)",
+                      get_desc().c_str(),
+                      seg_idx + 1,
+                      plan.segments.size(),
+                      segment.group_name.c_str(),
+                      segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED");
+
+            // Move this segment's layer group onto GPU (skipped for resident
+            // groups already loaded). Registry is currently unpopulated at
+            // runtime, so this is a harmless no-op until chunk-K residency
+            // lands in a later task.
+            if (!layer_registry_.move_layer_to_gpu(segment.group_name)) {
+                LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)",
+                          get_desc().c_str(),
+                          segment.group_name.c_str());
+            }
+
+            reset_segment_runtime_tensors(segment, gf);
+            if (!bind_segment_cached_inputs(gf, segment)) {
+                free_cache_ctx_and_buffer();
+                free_compute_buffer();
+                free_compute_ctx();
+                return std::nullopt;
+            }
+
+            if (!is_last) {
+                for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) {
+                    ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx);
+                    if (out_tensor != nullptr &&
+                        sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) &&
+                        future_cut_names.find(out_tensor->name) != future_cut_names.end()) {
+                        cache(out_tensor->name, out_tensor);
+                    }
+                }
+            }
+
+            ggml_context* segment_graph_ctx = nullptr;
+            ggml_cgraph*  segment_graph     = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
+            auto segment_output             = execute_graph<T>(segment_graph,
+                                                   n_threads,
+                                                   /*free_compute_buffer_immediately=*/true,
+                                                   sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()),
+                                                   /*preserve_backend_tensor_data_map=*/true,
+                                                   /*no_return=*/!is_last || no_return,
+                                                   &future_cut_names);
+            ggml_free(segment_graph_ctx);
+            if (!segment_output.has_value()) {
+                free_cache_ctx_and_buffer();
+                free_compute_buffer();
+                free_compute_ctx();
+                return std::nullopt;
+            }
+            output = std::move(segment_output);
+
+            // Evict this segment's layer group when streaming. Resident groups
+            // stay loaded for the next sampling step.
+            if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) {
+                layer_registry_.move_layer_to_cpu(segment.group_name);
+            }
+            (void)t_segment_begin;
+        }
+
+        backend_tensor_data_map.clear();
+        free_cache_ctx_and_buffer();
+        free_compute_ctx();
+        return output;
+    }
+
+private:
+    // Async-prefetch streaming executor. Identical semantics to the
+    // synchronous compute_streaming_segments() but interleaves the next
+    // segment's H2D copies with the current segment's GPU compute. Requires
+    // the planner budget to have been shrunk (see resolve_graph_cut_plan)
+    // so that two segments' partial-offload buffers + the compute buffer
+    // fit inside max_graph_vram_bytes.
+    template <typename T>
+    std::optional<sd::Tensor<T>> compute_streaming_segments_prefetch(ggml_cgraph*        gf,
+                                                                     const GraphCutPlan& plan,
+                                                                     int                 n_threads,
+                                                                     bool                no_return) {
+        std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
+        std::vector<ggml_tensor*> current_params;
+        std::vector<ggml_tensor*> next_params;
+
+        // Pre-offload first segment synchronously so we can begin compute.
+        current_params = sd::ggml_graph_cut::runtime_param_tensors(gf, plan.segments[0], get_desc().c_str());
+        if (!offload_partial_params(current_params)) {
+            LOG_ERROR("%s prefetch: initial partial offload failed", get_desc().c_str());
+            free_cache_ctx_and_buffer();
+            free_compute_buffer();
+            free_compute_ctx();
+            return std::nullopt;
+        }
+
+        auto bail = [this]() -> std::optional<sd::Tensor<T>> {
+            discard_pending_params();
+            free_cache_ctx_and_buffer();
+            free_compute_buffer();
+            free_compute_ctx();
+            return std::nullopt;
+        };
+
+        for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
+            const auto& segment   = plan.segments[seg_idx];
+            const bool  is_last   = seg_idx + 1 == plan.segments.size();
+            auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx);
+
+            LOG_DEBUG("%s prefetch executing segment %zu/%zu: %s (residency=%s)",
+                      get_desc().c_str(),
+                      seg_idx + 1,
+                      plan.segments.size(),
+                      segment.group_name.c_str(),
+                      segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED");
+
+            if (!layer_registry_.move_layer_to_gpu(segment.group_name)) {
+                LOG_DEBUG("%s prefetch: no registry entry for group '%s'",
+                          get_desc().c_str(), segment.group_name.c_str());
+            }
+
+            reset_segment_runtime_tensors(segment, gf);
+            if (!bind_segment_cached_inputs(gf, segment)) {
+                return bail();
+            }
+
+            if (!is_last) {
+                for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) {
+                    ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx);
+                    if (out_tensor != nullptr &&
+                        sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) &&
+                        future_cut_names.find(out_tensor->name) != future_cut_names.end()) {
+                        cache(out_tensor->name, out_tensor);
+                    }
+                }
+            }
+
+            ggml_context* segment_graph_ctx = nullptr;
+            ggml_cgraph*  segment_graph     = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
+
+            // Build / alloc compute buffer for this segment.
+            if (!alloc_compute_buffer(segment_graph)) {
+                LOG_ERROR("%s prefetch: alloc_compute_buffer failed", get_desc().c_str());
+                ggml_free(segment_graph_ctx);
+                return bail();
+            }
+            if (!ggml_gallocr_alloc_graph(compute_allocr, segment_graph)) {
+                LOG_ERROR("%s prefetch: alloc_graph failed", get_desc().c_str());
+                ggml_free(segment_graph_ctx);
+                return bail();
+            }
+            copy_data_to_backend_tensor(segment_graph, /*clear_after_copy=*/false);
+            if (ggml_backend_is_cpu(runtime_backend)) {
+                ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
+            }
+
+            // ASYNC compute. Returns once the work is enqueued on the
+            // compute stream; H2D issued in offload_pending_params() below
+            // will land on cudaStreamPerThread (independent of compute).
+            ggml_status status = ggml_backend_graph_compute_async(runtime_backend, segment_graph);
+            if (status != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s prefetch: graph_compute_async failed: %s",
+                          get_desc().c_str(),
+                          ggml_status_to_string(status));
+                ggml_backend_synchronize(runtime_backend);
+                ggml_free(segment_graph_ctx);
+                return bail();
+            }
+
+            // Issue prefetch for the NEXT segment while compute runs.
+            bool prefetched_next = false;
+            if (!is_last) {
+                next_params = sd::ggml_graph_cut::runtime_param_tensors(gf,
+                                                                        plan.segments[seg_idx + 1],
+                                                                        get_desc().c_str());
+                if (!offload_pending_params(next_params)) {
+                    LOG_ERROR("%s prefetch: pending offload failed at segment %zu",
+                              get_desc().c_str(), seg_idx + 1);
+                    ggml_backend_synchronize(runtime_backend);
+                    ggml_free(segment_graph_ctx);
+                    return bail();
+                }
+                prefetched_next = !pending_offload_pairs.empty();
+            }
+
+            // Wait for compute (and the just-issued H2D) to finish.
+            ggml_backend_synchronize(runtime_backend);
+
+            // Read output for the LAST segment.
+            if (is_last && !no_return) {
+                auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
+                if (result != nullptr) {
+                    output = read_graph_tensor<T>(result, "output");
+                    if (!output.has_value()) {
+                        ggml_free(segment_graph_ctx);
+                        return bail();
+                    }
+                } else {
+                    output = sd::Tensor<T>();
+                }
+            }
+
+            // Cache outputs that future segments will consume.
+            if (!copy_cache_tensors_to_cache_buffer(&future_cut_names)) {
+                ggml_free(segment_graph_ctx);
+                return bail();
+            }
+
+            ggml_free(segment_graph_ctx);
+
+            // Free the just-used compute buffer so the next segment's
+            // allocator can allocate fresh. This also tears down compute_allocr.
+            if (compute_allocr != nullptr) {
+                ggml_gallocr_free(compute_allocr);
+                compute_allocr = nullptr;
+            }
+
+            // Restore current segment's partial params (frees its GPU buffer).
+            restore_partial_params();
+
+            // Promote prefetched next-segment params to active partial state.
+            if (prefetched_next) {
+                promote_pending_to_partial();
+            }
+
+            if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) {
+                layer_registry_.move_layer_to_cpu(segment.group_name);
+            }
+        }
+
+        backend_tensor_data_map.clear();
+        free_cache_ctx_and_buffer();
+        free_compute_ctx();
+        return output;
+    }
+
+public:
+
     virtual std::string get_desc() = 0;
 
     GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend)
@@ -2638,9 +3358,11 @@ struct GGMLRunner {
         GGML_ASSERT(runtime_backend != nullptr);
         GGML_ASSERT(params_backend != nullptr);
         alloc_params_ctx();
+        layer_registry_.set_backends(runtime_backend, params_backend);
     }
 
     virtual ~GGMLRunner() {
+        restore_resident_params();
         free_params_buffer();
         free_compute_buffer();
         free_params_ctx();
@@ -2713,6 +3435,11 @@ struct GGMLRunner {
     }
 
     void free_params_buffer() {
+        // If chunk-K residency is active, our swap pointers reference the
+        // params backend buffer. Tear it down BEFORE freeing the params
+        // buffer so the swapped-back tensors don't end up with dangling
+        // pointers to a freed buffer.
+        restore_resident_params();
         if (params_buffer != nullptr) {
             ggml_backend_buffer_free(params_buffer);
             params_buffer = nullptr;
@@ -2736,6 +3463,7 @@ struct GGMLRunner {
             ggml_gallocr_free(compute_allocr);
             compute_allocr = nullptr;
         }
+        discard_pending_params();
         restore_partial_params();
         restore_all_params();
     }
@@ -2817,6 +3545,13 @@ struct GGMLRunner {
                 return std::nullopt;
             }
             if (should_use_graph_cut_segmented_compute(plan)) {
+                if (stream_layers_enabled) {
+                    return compute_streaming_segments<T>(gf,
+                                                         plan,
+                                                         n_threads,
+                                                         free_compute_buffer_immediately,
+                                                         no_return);
+                }
                 return compute_with_graph_cuts<T>(gf,
                                                   plan,
                                                   n_threads,
@@ -2857,6 +3592,12 @@ struct GGMLRunner {
         max_graph_vram_bytes = max_vram_bytes;
     }
 
+    void set_stream_layers_enabled(bool enabled) {
+        stream_layers_enabled = enabled;
+    }
+
+    sd::layer_registry::Registry& get_layer_registry() { return layer_registry_; }
+
     ggml_backend_t get_runtime_backend() {
         return runtime_backend;
     }
diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index a20de30c9..13570408f 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -753,4 +753,58 @@ namespace sd::ggml_graph_cut {
         return resolved_plan;
     }
 
+    void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) {
+        // Reset any RESIDENT marks from a prior call before considering the
+        // new budget. The plan is cached across compute() invocations and
+        // the budget changes per-call (free-VRAM clamp, runner state, etc.);
+        // without this reset, segments marked RESIDENT under a larger budget
+        // would persist when the budget shrinks.
+        for (auto& seg : plan.segments) {
+            seg.residency = SegmentResidency::STREAMED;
+        }
+        if (max_graph_vram_bytes == 0 || plan.segments.size() < 2) {
+            return;
+        }
+
+        // Sanity: skip if no segment carries any params.
+        bool any_param_bearing = false;
+        for (const auto& seg : plan.segments) {
+            if (seg.input_param_bytes > 0) {
+                any_param_bearing = true;
+                break;
+            }
+        }
+        if (!any_param_bearing) {
+            return;
+        }
+
+        // Headroom budget:
+        //   - safety: minimum free VRAM (512 MB)
+        //   - compute_buffer_reserve: peak compute buffer for one streamed segment
+        // Async prefetch is not yet implemented in the streaming executor, so we
+        // do not reserve an additional segment-sized prefetch window. Once Task 11
+        // (async prefetch) lands, add `prefetch_segments * largest_streamed_segment`
+        // back here.
+        constexpr size_t safety                 = 512ull * 1024 * 1024;
+        constexpr size_t compute_buffer_reserve = 768ull * 1024 * 1024;
+        const size_t reserved                   = safety + compute_buffer_reserve;
+
+        if (max_graph_vram_bytes <= reserved) {
+            return;
+        }
+        const size_t available = max_graph_vram_bytes - reserved;
+
+        // Greedily mark segments RESIDENT from the start until we exceed the
+        // available budget. Honors heterogeneous segment sizes (small prelude +
+        // larger transformer layers) better than dividing by a single estimate.
+        size_t cumulative = 0;
+        for (auto& seg : plan.segments) {
+            if (cumulative + seg.input_param_bytes > available) {
+                break;
+            }
+            seg.residency = SegmentResidency::RESIDENT;
+            cumulative += seg.input_param_bytes;
+        }
+    }
+
 }  // namespace sd::ggml_graph_cut
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 9e5a1f755..32f5d908c 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -167,6 +167,7 @@ class StableDiffusionGGML {
     sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
     bool offload_params_to_cpu           = false;
     float max_vram                       = 0.f;
+    bool stream_layers                   = false;
     bool use_pmid                        = false;
     std::string backend_spec;
     std::string params_backend_spec;
@@ -210,9 +211,14 @@ class StableDiffusionGGML {
 
     bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
         std::string error;
+        // Use the member offload_params_to_cpu (not sd_ctx_params->...) so the
+        // auto-enable in init() (when --stream-layers is set without explicit
+        // params placement) actually reaches backend_manager. Otherwise the
+        // log says "enabling --offload-to-cpu implicitly" but params still
+        // get allocated on the runtime backend.
         if (!backend_manager.init(sd_ctx_params->backend,
                                   sd_ctx_params->params_backend,
-                                  sd_ctx_params->offload_params_to_cpu,
+                                  offload_params_to_cpu,
                                   sd_ctx_params->keep_clip_on_cpu,
                                   sd_ctx_params->keep_vae_on_cpu,
                                   sd_ctx_params->keep_control_net_on_cpu,
@@ -239,8 +245,25 @@ class StableDiffusionGGML {
         free_params_immediately = sd_ctx_params->free_params_immediately;
         offload_params_to_cpu   = sd_ctx_params->offload_params_to_cpu;
         max_vram                = sd_ctx_params->max_vram;
+        stream_layers           = sd_ctx_params->stream_layers;
         backend_spec            = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec     = SAFE_STR(sd_ctx_params->params_backend);
+        if (stream_layers && max_vram == 0.f) {
+            // max_vram == 0 means "disabled"; negative values mean
+            // "auto-detect (free VRAM minus |max_vram| GiB)" and are
+            // resolved later in ggml_graph_cut::resolve_auto_max_vram_bytes.
+            LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring");
+            stream_layers = false;
+        }
+        if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) {
+            // Streaming needs weights somewhere they can stream FROM — i.e.
+            // params on CPU. Without --offload-to-cpu or an explicit
+            // --params-backend, weights would live on GPU, which makes
+            // streaming a silent no-op (params_backend == runtime_backend
+            // short-circuits should_use_graph_cut_segmented_compute).
+            LOG_INFO("--stream-layers requires CPU-resident weights; enabling --offload-to-cpu implicitly");
+            offload_params_to_cpu = true;
+        }
 
         bool use_tae       = false;
         bool use_audio_vae = false;
@@ -419,7 +442,18 @@ class StableDiffusionGGML {
                     }
                 }
             }
-            if (have_quantized_weight) {
+            // Immediate mode bakes LoRA into weights at load time by running
+            // a forward pass over EVERY weight tensor. That allocates a full-
+            // model-sized compute buffer (~11 GB for Z-Image bf16) on the
+            // runtime backend. On VRAM-constrained setups (--offload-to-cpu
+            // and/or --stream-layers, i.e. the whole reason streaming exists),
+            // that allocation OOMs — so AUTO must pick runtime there instead.
+            // Runtime mode is correct but slower because chunk-K residency
+            // is skipped while a weight_adapter is attached (see
+            // compute_streaming_segments).
+            const bool streaming_constrained = sd_ctx_params->stream_layers ||
+                                               sd_ctx_params->offload_params_to_cpu;
+            if (have_quantized_weight || streaming_constrained) {
                 apply_lora_immediately = false;
             } else {
                 apply_lora_immediately = true;
@@ -705,6 +739,7 @@ class StableDiffusionGGML {
             get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE));
 
             diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
+            diffusion_model->set_stream_layers_enabled(stream_layers);
             get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
 
             if (sd_version_is_unet_edit(version)) {
@@ -713,6 +748,7 @@ class StableDiffusionGGML {
 
             if (high_noise_diffusion_model) {
                 high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
+                high_noise_diffusion_model->set_stream_layers_enabled(stream_layers);
                 get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
             }
 
@@ -2277,6 +2313,21 @@ class StableDiffusionGGML {
     }
 
     sd::Tensor<float> decode_first_stage(const sd::Tensor<float>& x, bool decode_video = false) {
+        // Release diffusion's chunk-K resident set before VAE decode. With
+        // --stream-layers, the diffusion runner keeps several GB resident
+        // across sampling steps to amortize H2D. VAE decode happens once at
+        // the end, so holding diffusion's resident set during decode just
+        // starves VAE of VRAM for its compute buffer (which can be several
+        // GB at full image resolution). Eviction frees that VRAM for VAE
+        // and is paid back at the next generation's first sampling step.
+        if (stream_layers) {
+            if (diffusion_model) {
+                diffusion_model->release_streaming_residency();
+            }
+            if (high_noise_diffusion_model) {
+                high_noise_diffusion_model->release_streaming_residency();
+            }
+        }
         auto latents = first_stage_model->diffusion_to_vae_latents(x);
         first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
         return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
@@ -2592,6 +2643,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->lora_apply_mode         = LORA_APPLY_AUTO;
     sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->max_vram                = 0.f;
+    sd_ctx_params->stream_layers           = false;
     sd_ctx_params->enable_mmap             = false;
     sd_ctx_params->keep_clip_on_cpu        = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
@@ -2638,6 +2690,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "prediction: %s\n"
              "offload_params_to_cpu: %s\n"
              "max_vram: %.3f\n"
+             "stream_layers: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
              "keep_clip_on_cpu: %s\n"
@@ -2675,6 +2728,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              sd_prediction_name(sd_ctx_params->prediction),
              BOOL_STR(sd_ctx_params->offload_params_to_cpu),
              sd_ctx_params->max_vram,
+             BOOL_STR(sd_ctx_params->stream_layers),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
              BOOL_STR(sd_ctx_params->keep_clip_on_cpu),

From 5cd49ca72793b9e37567af5db1ed905a5b7ee455 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Fri, 29 May 2026 14:34:01 +0200
Subject: [PATCH 03/11] Snapshot+restore persistent EXTERNAL inputs so runtime
 LoRA scales correctly

---
 src/ggml_extend.hpp | 63 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 093220a85..de3e2f587 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2704,8 +2704,38 @@ struct GGMLRunner {
         return true;
     }
 
+    struct PersistentExternalBinding {
+        ggml_backend_buffer_t buffer = nullptr;
+        void* data                   = nullptr;
+        void* extra                  = nullptr;
+    };
+
+    void snapshot_persistent_externals(const sd::ggml_graph_cut::Plan& plan,
+                                       ggml_cgraph* gf,
+                                       std::unordered_map<ggml_tensor*, PersistentExternalBinding>& out) {
+        GGML_ASSERT(gf != nullptr);
+        out.clear();
+        for (const auto& segment : plan.segments) {
+            for (const auto& input : segment.input_refs) {
+                if (input.type != GraphCutSegment::INPUT_EXTERNAL) {
+                    continue;
+                }
+                ggml_tensor* tensor = sd::ggml_graph_cut::input_tensor(gf, input);
+                if (tensor == nullptr || tensor->buffer == nullptr) {
+                    continue;
+                }
+                PersistentExternalBinding binding;
+                binding.buffer = tensor->buffer;
+                binding.data   = tensor->data;
+                binding.extra  = tensor->extra;
+                out[tensor]    = binding;
+            }
+        }
+    }
+
     void reset_segment_runtime_tensors(const GraphCutSegment& segment,
-                                       ggml_cgraph* gf) {
+                                       ggml_cgraph* gf,
+                                       const std::unordered_map<ggml_tensor*, PersistentExternalBinding>* persistent_externals = nullptr) {
         GGML_ASSERT(gf != nullptr);
 
         for (const auto& input : segment.input_refs) {
@@ -2715,11 +2745,25 @@ struct GGMLRunner {
             }
             switch (input.type) {
                 case GraphCutSegment::INPUT_PREVIOUS_CUT:
-                case GraphCutSegment::INPUT_EXTERNAL:
                     input_tensor->buffer = nullptr;
                     input_tensor->data   = nullptr;
                     input_tensor->extra  = nullptr;
                     break;
+                case GraphCutSegment::INPUT_EXTERNAL: {
+                    if (persistent_externals != nullptr) {
+                        auto it = persistent_externals->find(input_tensor);
+                        if (it != persistent_externals->end()) {
+                            input_tensor->buffer = it->second.buffer;
+                            input_tensor->data   = it->second.data;
+                            input_tensor->extra  = it->second.extra;
+                            break;
+                        }
+                    }
+                    input_tensor->buffer = nullptr;
+                    input_tensor->data   = nullptr;
+                    input_tensor->extra  = nullptr;
+                    break;
+                }
                 case GraphCutSegment::INPUT_PARAM:
                     break;
             }
@@ -2934,6 +2978,9 @@ struct GGMLRunner {
         free_compute_buffer();
         free_cache_ctx_and_buffer();
 
+        std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
+        snapshot_persistent_externals(plan, gf, persistent_externals);
+
         std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
         for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
             int64_t t_segment_begin = ggml_time_ms();
@@ -2945,7 +2992,7 @@ struct GGMLRunner {
                       plan.segments.size(),
                       segment.group_name.c_str());
 
-            reset_segment_runtime_tensors(segment, gf);
+            reset_segment_runtime_tensors(segment, gf, &persistent_externals);
             if (!bind_segment_cached_inputs(gf, segment)) {
                 free_cache_ctx_and_buffer();
                 free_compute_buffer();
@@ -3107,6 +3154,9 @@ struct GGMLRunner {
             return compute_streaming_segments_prefetch<T>(gf, plan, n_threads, no_return);
         }
 
+        std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
+        snapshot_persistent_externals(plan, gf, persistent_externals);
+
         std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
         for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
             int64_t t_segment_begin = ggml_time_ms();
@@ -3131,7 +3181,7 @@ struct GGMLRunner {
                           segment.group_name.c_str());
             }
 
-            reset_segment_runtime_tensors(segment, gf);
+            reset_segment_runtime_tensors(segment, gf, &persistent_externals);
             if (!bind_segment_cached_inputs(gf, segment)) {
                 free_cache_ctx_and_buffer();
                 free_compute_buffer();
@@ -3216,6 +3266,9 @@ struct GGMLRunner {
             return std::nullopt;
         };
 
+        std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
+        snapshot_persistent_externals(plan, gf, persistent_externals);
+
         for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
             const auto& segment   = plan.segments[seg_idx];
             const bool  is_last   = seg_idx + 1 == plan.segments.size();
@@ -3233,7 +3286,7 @@ struct GGMLRunner {
                           get_desc().c_str(), segment.group_name.c_str());
             }
 
-            reset_segment_runtime_tensors(segment, gf);
+            reset_segment_runtime_tensors(segment, gf, &persistent_externals);
             if (!bind_segment_cached_inputs(gf, segment)) {
                 return bail();
             }

From 94be2a79f8808a8c43562e5fc98f163006f949a2 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sat, 30 May 2026 22:47:48 +0200
Subject: [PATCH 04/11] Warn-and-disable --stream-layers when --offload-to-cpu
 missing

---
 src/stable-diffusion.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 501b22f22..4546094ad 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -256,13 +256,11 @@ class StableDiffusionGGML {
             stream_layers = false;
         }
         if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) {
-            // Streaming needs weights somewhere they can stream FROM — i.e.
-            // params on CPU. Without --offload-to-cpu or an explicit
-            // --params-backend, weights would live on GPU, which makes
-            // streaming a silent no-op (params_backend == runtime_backend
-            // short-circuits should_use_graph_cut_segmented_compute).
-            LOG_INFO("--stream-layers requires CPU-resident weights; enabling --offload-to-cpu implicitly");
-            offload_params_to_cpu = true;
+            // Streaming needs weights on CPU to stream from. Mirror the
+            // --max-vram-missing branch above: warn and disable instead of
+            // silently flipping the caller's offload-to-cpu setting.
+            LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring");
+            stream_layers = false;
         }
 
         bool use_tae       = false;

From fc9ddeeacde30bca5f342c634d65acdfe26e1be2 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sat, 30 May 2026 22:47:49 +0200
Subject: [PATCH 05/11] Rename Registry to LayerRegistry for clarity at call
 sites

---
 src/ggml_extend.hpp    |  4 ++--
 src/layer_registry.cpp | 10 +++++-----
 src/layer_registry.h   |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index de3e2f587..3b2f59583 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1750,7 +1750,7 @@ struct GGMLRunner {
     size_t max_graph_vram_bytes = 0;
     bool stream_layers_enabled = false;
 
-    sd::layer_registry::Registry layer_registry_;
+    sd::layer_registry::LayerRegistry layer_registry_;
 
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
 
@@ -3649,7 +3649,7 @@ struct GGMLRunner {
         stream_layers_enabled = enabled;
     }
 
-    sd::layer_registry::Registry& get_layer_registry() { return layer_registry_; }
+    sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; }
 
     ggml_backend_t get_runtime_backend() {
         return runtime_backend;
diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp
index 9b809d7b2..78ece2b91 100644
--- a/src/layer_registry.cpp
+++ b/src/layer_registry.cpp
@@ -6,13 +6,13 @@
 
 namespace sd::layer_registry {
 
-void Registry::register_layer(const std::string& name, ggml_tensor* tensor) {
+void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
     auto& info = layers_[name];
     info.tensors.push_back(tensor);
     info.bytes += ggml_nbytes(tensor);
 }
 
-bool Registry::move_layer_to_gpu(const std::string& name) {
+bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
     auto it = layers_.find(name);
     if (it == layers_.end()) return false;
 
@@ -80,7 +80,7 @@ bool Registry::move_layer_to_gpu(const std::string& name) {
     return true;
 }
 
-bool Registry::move_layer_to_cpu(const std::string& name) {
+bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
     auto it = layers_.find(name);
     if (it == layers_.end()) return false;
 
@@ -114,12 +114,12 @@ bool Registry::move_layer_to_cpu(const std::string& name) {
     return true;
 }
 
-bool Registry::is_layer_on_gpu(const std::string& name) const {
+bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
     auto it = layers_.find(name);
     return it != layers_.end() && it->second.on_gpu;
 }
 
-size_t Registry::get_layer_size(const std::string& name) const {
+size_t LayerRegistry::get_layer_size(const std::string& name) const {
     auto it = layers_.find(name);
     return it != layers_.end() ? it->second.bytes : 0;
 }
diff --git a/src/layer_registry.h b/src/layer_registry.h
index d8b572db3..6727130e4 100644
--- a/src/layer_registry.h
+++ b/src/layer_registry.h
@@ -20,10 +20,10 @@ struct LayerInfo {
     size_t                    bytes      = 0;
 };
 
-class Registry {
+class LayerRegistry {
 public:
-    Registry() = default;
-    Registry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
+    LayerRegistry() = default;
+    LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
         : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
 
     void   set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {

From 06b19946ac9423fa846c7f85681801df7c11dbf4 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sun, 31 May 2026 20:45:01 +0200
Subject: [PATCH 06/11] Model active segment footprint in annotate_residency

---
 src/ggml_graph_cut.cpp | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index 13570408f..77dba0b93 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -778,16 +778,23 @@ namespace sd::ggml_graph_cut {
             return;
         }
 
-        // Headroom budget:
-        //   - safety: minimum free VRAM (512 MB)
-        //   - compute_buffer_reserve: peak compute buffer for one streamed segment
-        // Async prefetch is not yet implemented in the streaming executor, so we
-        // do not reserve an additional segment-sized prefetch window. Once Task 11
-        // (async prefetch) lands, add `prefetch_segments * largest_streamed_segment`
-        // back here.
-        constexpr size_t safety                 = 512ull * 1024 * 1024;
-        constexpr size_t compute_buffer_reserve = 768ull * 1024 * 1024;
-        const size_t reserved                   = safety + compute_buffer_reserve;
+        // Model the active streamed segment's real footprint instead of a
+        // fixed reserve: params + compute buffer + output + boundary-cut
+        // inputs. The largest such footprint over the plan is the headroom
+        // the resident set must leave free.
+        size_t worst_streamed_footprint = 0;
+        for (const auto& seg : plan.segments) {
+            const size_t seg_footprint = seg.input_param_bytes +
+                                         seg.compute_buffer_size +
+                                         seg.output_bytes +
+                                         seg.input_previous_cut_bytes +
+                                         seg.input_external_bytes;
+            if (seg_footprint > worst_streamed_footprint) {
+                worst_streamed_footprint = seg_footprint;
+            }
+        }
+        constexpr size_t safety = 512ull * 1024 * 1024;
+        const size_t reserved   = safety + worst_streamed_footprint;
 
         if (max_graph_vram_bytes <= reserved) {
             return;

From 8c337ab472533d89113ddbbbc2e75fc07726e509 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sun, 31 May 2026 20:45:02 +0200
Subject: [PATCH 07/11] Drop async prefetch path and consolidate residency
 annotation

---
 src/ggml_extend.hpp | 463 ++------------------------------------------
 1 file changed, 18 insertions(+), 445 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 3b2f59583..d85f3480c 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1727,20 +1727,10 @@ struct GGMLRunner {
     ggml_backend_buffer_t partial_runtime_params_buffer = nullptr;
     std::vector<std::pair<ggml_tensor*, ggml_tensor*>> partial_offload_pairs;
 
-    // Async-prefetch staging: a parallel partial-offload track used by the
-    // streaming executor to issue the NEXT segment's H2D copies while the
-    // CURRENT segment is computing. After the current segment's compute is
-    // done and its partial state is torn down, the pending_* members are
-    // promoted into partial_* via promote_pending_to_partial().
-    ggml_context* pending_offload_ctx                   = nullptr;
-    ggml_backend_buffer_t pending_runtime_params_buffer = nullptr;
-    std::vector<std::pair<ggml_tensor*, ggml_tensor*>> pending_offload_pairs;
-
-    // chunk-K residency track: holds params that stay GPU-resident across
-    // multiple compute_streaming_segments() invocations within a generation.
-    // Coexists with the partial_* per-segment track; offload_partial_params()
-    // filters out anything already in resident_param_set so its
-    // dup-copy-swap dance doesn't tear down the resident buffer-swap.
+    // chunk-K residency: GPU-resident params shared across streaming
+    // segments. offload_partial_params() skips tensors in
+    // resident_param_set so the per-segment offload track does not tear
+    // down the resident buffer-swap.
     ggml_context* resident_offload_ctx = nullptr;
     std::vector<std::pair<ggml_tensor*, ggml_tensor*>> resident_offload_pairs;
     ggml_backend_buffer_t resident_runtime_params_buffer = nullptr;
@@ -1828,10 +1818,6 @@ struct GGMLRunner {
             ggml_free(partial_offload_ctx);
             partial_offload_ctx = nullptr;
         }
-        if (pending_offload_ctx != nullptr) {
-            ggml_free(pending_offload_ctx);
-            pending_offload_ctx = nullptr;
-        }
     }
 
     void alloc_cache_ctx() {
@@ -2347,157 +2333,9 @@ struct GGMLRunner {
         }
     }
 
-    // Issue an H2D copy for the next segment's params WITHOUT performing
-    // a buffer-swap on the source tensors yet. The dup-tensors hold the GPU
-    // data ready to swap in. Unlike offload_partial_params, this does NOT
-    // call restore_partial_params() — the current partial state must remain
-    // intact so the in-flight compute is unaffected.
-    //
-    // The H2D goes through ggml_backend_tensor_copy() which for CUDA hits
-    // cudaStreamPerThread; compute runs on cuda_ctx->stream() (a separate
-    // non-blocking stream), so the two should overlap on-device.
-    bool offload_pending_params(const std::vector<ggml_tensor*>& tensors) {
-        if (params_backend == runtime_backend) {
-            return true;
-        }
-        if (tensors.empty()) {
-            return true;
-        }
-        GGML_ASSERT(pending_offload_ctx == nullptr);
-        GGML_ASSERT(pending_runtime_params_buffer == nullptr);
-        GGML_ASSERT(pending_offload_pairs.empty());
-
-        std::vector<ggml_tensor*> unique_tensors;
-        std::unordered_set<ggml_tensor*> seen_tensors;
-        unique_tensors.reserve(tensors.size());
-        seen_tensors.reserve(tensors.size());
-        for (ggml_tensor* tensor : tensors) {
-            if (tensor == nullptr) {
-                continue;
-            }
-            if (resident_param_set.find(tensor) != resident_param_set.end()) {
-                continue;
-            }
-            if (seen_tensors.insert(tensor).second) {
-                unique_tensors.push_back(tensor);
-            }
-        }
-        if (unique_tensors.empty()) {
-            return true;
-        }
-
-        ggml_init_params params;
-        params.mem_size   = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
-        params.mem_buffer = nullptr;
-        params.no_alloc   = true;
-
-        pending_offload_ctx = ggml_init(params);
-        GGML_ASSERT(pending_offload_ctx != nullptr);
-
-        pending_offload_pairs.clear();
-        pending_offload_pairs.reserve(unique_tensors.size());
-
-        for (ggml_tensor* tensor : unique_tensors) {
-            GGML_ASSERT(tensor->view_src == nullptr);
-            ggml_tensor* dup = ggml_dup_tensor(pending_offload_ctx, tensor);
-            ggml_set_name(dup, tensor->name);
-            pending_offload_pairs.push_back({tensor, dup});
-        }
-
-        pending_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(pending_offload_ctx, runtime_backend);
-        if (pending_runtime_params_buffer == nullptr) {
-            LOG_ERROR("%s alloc pending runtime params backend buffer failed, num_tensors = %zu",
-                      get_desc().c_str(),
-                      pending_offload_pairs.size());
-            ggml_free(pending_offload_ctx);
-            pending_offload_ctx = nullptr;
-            pending_offload_pairs.clear();
-            return false;
-        }
-        ggml_backend_buffer_set_usage(pending_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        // Issue the H2D copies. The src tensors are still pointing at host
-        // (or params backend) data; the dup tensors are on runtime_backend.
-        // ggml_backend_tensor_copy(host_src, gpu_dst) calls
-        // ggml_backend_tensor_set(dst, ...) which for CUDA enqueues on
-        // cudaStreamPerThread — independent of the compute stream.
-        for (auto& pair : pending_offload_pairs) {
-            ggml_tensor* tensor         = pair.first;
-            ggml_tensor* offload_tensor = pair.second;
-            ggml_backend_tensor_copy(tensor, offload_tensor);
-        }
-
-        return true;
-    }
-
-    // Promote a pending (prefetched) offload track into the active partial
-    // track. Caller MUST have already torn down the previous partial track
-    // via restore_partial_params(). After this call, partial_* owns the
-    // pending dup-tensors and the swap into the active source tensors is
-    // performed here.
-    void promote_pending_to_partial() {
-        if (pending_offload_pairs.empty()) {
-            if (pending_runtime_params_buffer != nullptr) {
-                ggml_backend_buffer_free(pending_runtime_params_buffer);
-                pending_runtime_params_buffer = nullptr;
-            }
-            if (pending_offload_ctx != nullptr) {
-                ggml_free(pending_offload_ctx);
-                pending_offload_ctx = nullptr;
-            }
-            return;
-        }
-        GGML_ASSERT(partial_offload_pairs.empty());
-        GGML_ASSERT(partial_offload_ctx == nullptr);
-        GGML_ASSERT(partial_runtime_params_buffer == nullptr);
-
-        // Swap source tensors to point at the prefetched GPU storage.
-        for (auto& pair : pending_offload_pairs) {
-            ggml_tensor* tensor         = pair.first;
-            ggml_tensor* offload_tensor = pair.second;
-            std::swap(tensor->buffer, offload_tensor->buffer);
-            std::swap(tensor->data, offload_tensor->data);
-            std::swap(tensor->extra, offload_tensor->extra);
-        }
-        partial_offload_ctx           = pending_offload_ctx;
-        partial_runtime_params_buffer = pending_runtime_params_buffer;
-        partial_offload_pairs         = std::move(pending_offload_pairs);
-        pending_offload_ctx           = nullptr;
-        pending_runtime_params_buffer = nullptr;
-        pending_offload_pairs.clear();
-    }
-
-    // Discard any in-flight pending offload track without promoting it.
-    // Used on error paths after offload_pending_params() succeeded but the
-    // subsequent compute failed.
-    void discard_pending_params() {
-        if (pending_offload_pairs.empty()) {
-            if (pending_runtime_params_buffer != nullptr) {
-                ggml_backend_buffer_free(pending_runtime_params_buffer);
-                pending_runtime_params_buffer = nullptr;
-            }
-            if (pending_offload_ctx != nullptr) {
-                ggml_free(pending_offload_ctx);
-                pending_offload_ctx = nullptr;
-            }
-            return;
-        }
-        if (pending_runtime_params_buffer != nullptr) {
-            ggml_backend_buffer_free(pending_runtime_params_buffer);
-            pending_runtime_params_buffer = nullptr;
-        }
-        pending_offload_pairs.clear();
-        if (pending_offload_ctx != nullptr) {
-            ggml_free(pending_offload_ctx);
-            pending_offload_ctx = nullptr;
-        }
-    }
-
-    // chunk-K: load a set of params onto the runtime backend and keep them
-    // resident across compute() calls. Caller MUST have torn down any prior
-    // resident state via restore_resident_params() before calling. By design
-    // this does NOT call restore_resident_params() itself — the resident
-    // track persists across compute() invocations.
+    // chunk-K: load params onto the runtime backend and keep them resident
+    // across compute() calls. Caller must release any prior resident set
+    // via restore_resident_params() first.
     bool offload_resident_params(const std::vector<ggml_tensor*>& tensors) {
         if (params_backend == runtime_backend) {
             return true;
@@ -2645,61 +2483,16 @@ struct GGMLRunner {
             }
         }
 
-        // Split the effective budget so chunk-K residency and the prefetch
-        // window (2 segments + compute buffer + safety) coexist:
-        //   effective = chunk_K + 2*segment + reserve, with chunk_K == 2*segment
-        //   => segment = (effective - reserve) / 4
-        //   => chunk_K = 2 * segment
-        constexpr size_t reserve = 512ull * 1024 * 1024 + 768ull * 1024 * 1024;
-        size_t plan_budget       = effective_budget;
-        size_t residency_budget  = effective_budget;
-        if (stream_layers_enabled && effective_budget > 0) {
-            // Always pass the full effective_budget to the planner. Earlier
-            // versions of this branch shrank the budget to (effective-reserve)/4
-            // so the prefetch path could hold two segments + compute buffer
-            // simultaneously and overlap H2D with compute. That worked for
-            // simple workloads but produced wrong output (white frames /
-            // colored noise) whenever the smaller-merged-segments combined
-            // with non-default --guidance or --flow-shift and batch_count > 1
-            // (FP error accumulates across the extra boundary-cache
-            // roundtrips). Runtime LoRA needed the same fall-back for the
-            // same reason. With the full budget the planner produces the
-            // walker's large merged segments; async prefetch can't engage
-            // (no room for two), but chunk-K residency still runs on the
-            // unmerged base plan and keeps most of the per-step H2D win.
-            plan_budget      = effective_budget;
-            residency_budget = effective_budget;
-        }
-
         *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                      gf,
                                                      &graph_cut_plan_cache_,
-                                                     plan_budget,
+                                                     effective_budget,
                                                      params_tensor_set_,
                                                      get_desc().c_str());
         if (stream_layers_enabled) {
-            // chunk-K residency lives alongside ONE active merged segment
-            // at compute time (we no longer prefetch two at once). Reserve
-            // room for the largest merged segment's params, then let chunk-K
-            // fill the rest. annotate_residency internally subtracts its own
-            // safety + compute_buffer pad, so we pass it `effective - largest`
-            // and it computes available = (effective - largest) - reserve.
-            size_t largest_segment_params = 0;
-            for (const auto& seg : plan_out->segments) {
-                if (seg.input_param_bytes > largest_segment_params) {
-                    largest_segment_params = seg.input_param_bytes;
-                }
-            }
-            const size_t residency_for_annotate =
-                (residency_budget > largest_segment_params)
-                    ? (residency_budget - largest_segment_params)
-                    : 0;
-            sd::ggml_graph_cut::annotate_residency(*plan_out, residency_for_annotate);
-            LOG_INFO("%s streaming budget = %.2f MB (largest merged segment = %.2f MB, chunk-K available = %.2f MB)",
+            LOG_INFO("%s streaming budget = %.2f MB",
                      get_desc().c_str(),
-                     effective_budget / (1024.0 * 1024.0),
-                     largest_segment_params / (1024.0 * 1024.0),
-                     (residency_for_annotate > reserve ? (residency_for_annotate - reserve) : 0) / (1024.0 * 1024.0));
+                     effective_budget / (1024.0 * 1024.0));
         }
         return true;
     }
@@ -3057,38 +2850,17 @@ struct GGMLRunner {
         // chunk-K residency: annotate the BASE plan (per-layer granularity) and
         // offload its first-K segments' params once. Compute itself proceeds on
         // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params
-        // filters out resident tensors so the merged-segment offload skips them.
-        //
-        // Disabled when a runtime weight_adapter (LoRA in at-runtime mode) is
-        // attached: chunk-K snapshots the CPU weight data once and freezes the
-        // GPU resident copy. A runtime adapter modifies the weights between
-        // calls (and even between sampling steps), which leaves the resident
-        // GPU copy stale — observable as garbage output on later batch images.
-        // Our state_token hashes tensor *pointers*, not data, so it can't
-        // detect the modification; the safe behaviour is to release any prior
-        // resident set and skip the rebuild. Async prefetch on the streaming
-        // path still engages, so the perf hit is bounded.
+        // Runtime weight_adapter (e.g. LoRA in at-runtime mode) mutates the
+        // CPU weight data between compute calls. The chunk-K resident GPU
+        // copy is snapshotted once at offload, so it would go stale; the
+        // state_token only hashes tensor pointers and cannot detect this.
+        // Release any resident set and skip rebuild in that case.
         if (weight_adapter != nullptr) {
             restore_resident_params();
         } else {
             sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
             if (base_plan.available) {
-                // Reserve headroom for the active merged-segment's params
-                // (the running compute holds them on GPU alongside chunk-K).
-                // Without this, chunk-K can grow until the largest merged
-                // segment's offload OOMs. Find the worst-case merged segment
-                // and subtract its param weight from the chunk-K budget.
-                size_t largest_merged_params = 0;
-                for (const auto& seg : plan.segments) {
-                    if (seg.input_param_bytes > largest_merged_params) {
-                        largest_merged_params = seg.input_param_bytes;
-                    }
-                }
-                const size_t chunk_k_budget =
-                    (max_graph_vram_bytes > largest_merged_params)
-                        ? (max_graph_vram_bytes - largest_merged_params)
-                        : 0;
-                sd::ggml_graph_cut::annotate_residency(base_plan, chunk_k_budget);
+                sd::ggml_graph_cut::annotate_residency(base_plan, max_graph_vram_bytes);
 
                 std::vector<ggml_tensor*> resident_params;
                 uint64_t token = 0;
@@ -3104,10 +2876,6 @@ struct GGMLRunner {
                     }
                 }
                 if (token != resident_state_token) {
-                    if (resident_state_token != 0) {
-                        LOG_DEBUG("%s chunk-K: base-plan residency changed, rebuilding resident set",
-                                  get_desc().c_str());
-                    }
                     restore_resident_params();
                     if (!resident_params.empty()) {
                         if (offload_resident_params(resident_params)) {
@@ -3125,35 +2893,10 @@ struct GGMLRunner {
         free_compute_buffer();
         free_cache_ctx_and_buffer();
 
-        // Load any tensors not associated with a segment (a "_global" group
-        // holds the head/tail params). Move to GPU once for the full run.
+        // Tensors not associated with any segment (the "_global" group holds
+        // head/tail params) move to GPU once for the full run.
         layer_registry_.move_layer_to_gpu("_global");
 
-        // Async prefetch is currently disabled. The path is still implemented
-        // (compute_streaming_segments_prefetch below) and was the source of
-        // the original 26% wallclock win on the no-LoRA path, but two
-        // correctness problems force it off for now:
-        //
-        //   * Multi-LoRA workloads: graph_compute_async + per-segment
-        //     pending offload races MultiLoraAdapter's per-layer patch_weight
-        //     reads, producing colored static noise on both batch images.
-        //   * batch_count > 1 combined with non-default --guidance AND
-        //     --flow-shift: the smaller merged segments required to fit two
-        //     prefetched buffers + compute buffer in --max-vram accumulate
-        //     enough FP error across the extra boundary-cache roundtrips to
-        //     collapse the diffusion output to pure white frames.
-        //
-        // Until both are properly fixed (most likely by porting PR #1477's
-        // chunk_graph.hpp — see unified_streaming_future_optimizations memory
-        // note), the synchronous segment loop below is the only validated
-        // streaming path. chunk-K residency still engages, which preserves
-        // most of the per-step H2D savings.
-        const bool prefetch_enabled = false;
-
-        if (prefetch_enabled) {
-            return compute_streaming_segments_prefetch<T>(gf, plan, n_threads, no_return);
-        }
-
         std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
         snapshot_persistent_externals(plan, gf, persistent_externals);
 
@@ -3232,175 +2975,6 @@ struct GGMLRunner {
         return output;
     }
 
-private:
-    // Async-prefetch streaming executor. Identical semantics to the
-    // synchronous compute_streaming_segments() but interleaves the next
-    // segment's H2D copies with the current segment's GPU compute. Requires
-    // the planner budget to have been shrunk (see resolve_graph_cut_plan)
-    // so that two segments' partial-offload buffers + the compute buffer
-    // fit inside max_graph_vram_bytes.
-    template <typename T>
-    std::optional<sd::Tensor<T>> compute_streaming_segments_prefetch(ggml_cgraph*        gf,
-                                                                     const GraphCutPlan& plan,
-                                                                     int                 n_threads,
-                                                                     bool                no_return) {
-        std::optional<sd::Tensor<T>> output = sd::Tensor<T>();
-        std::vector<ggml_tensor*> current_params;
-        std::vector<ggml_tensor*> next_params;
-
-        // Pre-offload first segment synchronously so we can begin compute.
-        current_params = sd::ggml_graph_cut::runtime_param_tensors(gf, plan.segments[0], get_desc().c_str());
-        if (!offload_partial_params(current_params)) {
-            LOG_ERROR("%s prefetch: initial partial offload failed", get_desc().c_str());
-            free_cache_ctx_and_buffer();
-            free_compute_buffer();
-            free_compute_ctx();
-            return std::nullopt;
-        }
-
-        auto bail = [this]() -> std::optional<sd::Tensor<T>> {
-            discard_pending_params();
-            free_cache_ctx_and_buffer();
-            free_compute_buffer();
-            free_compute_ctx();
-            return std::nullopt;
-        };
-
-        std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
-        snapshot_persistent_externals(plan, gf, persistent_externals);
-
-        for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
-            const auto& segment   = plan.segments[seg_idx];
-            const bool  is_last   = seg_idx + 1 == plan.segments.size();
-            auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx);
-
-            LOG_DEBUG("%s prefetch executing segment %zu/%zu: %s (residency=%s)",
-                      get_desc().c_str(),
-                      seg_idx + 1,
-                      plan.segments.size(),
-                      segment.group_name.c_str(),
-                      segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED");
-
-            if (!layer_registry_.move_layer_to_gpu(segment.group_name)) {
-                LOG_DEBUG("%s prefetch: no registry entry for group '%s'",
-                          get_desc().c_str(), segment.group_name.c_str());
-            }
-
-            reset_segment_runtime_tensors(segment, gf, &persistent_externals);
-            if (!bind_segment_cached_inputs(gf, segment)) {
-                return bail();
-            }
-
-            if (!is_last) {
-                for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) {
-                    ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx);
-                    if (out_tensor != nullptr &&
-                        sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) &&
-                        future_cut_names.find(out_tensor->name) != future_cut_names.end()) {
-                        cache(out_tensor->name, out_tensor);
-                    }
-                }
-            }
-
-            ggml_context* segment_graph_ctx = nullptr;
-            ggml_cgraph*  segment_graph     = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
-
-            // Build / alloc compute buffer for this segment.
-            if (!alloc_compute_buffer(segment_graph)) {
-                LOG_ERROR("%s prefetch: alloc_compute_buffer failed", get_desc().c_str());
-                ggml_free(segment_graph_ctx);
-                return bail();
-            }
-            if (!ggml_gallocr_alloc_graph(compute_allocr, segment_graph)) {
-                LOG_ERROR("%s prefetch: alloc_graph failed", get_desc().c_str());
-                ggml_free(segment_graph_ctx);
-                return bail();
-            }
-            copy_data_to_backend_tensor(segment_graph, /*clear_after_copy=*/false);
-            if (ggml_backend_is_cpu(runtime_backend)) {
-                ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
-            }
-
-            // ASYNC compute. Returns once the work is enqueued on the
-            // compute stream; H2D issued in offload_pending_params() below
-            // will land on cudaStreamPerThread (independent of compute).
-            ggml_status status = ggml_backend_graph_compute_async(runtime_backend, segment_graph);
-            if (status != GGML_STATUS_SUCCESS) {
-                LOG_ERROR("%s prefetch: graph_compute_async failed: %s",
-                          get_desc().c_str(),
-                          ggml_status_to_string(status));
-                ggml_backend_synchronize(runtime_backend);
-                ggml_free(segment_graph_ctx);
-                return bail();
-            }
-
-            // Issue prefetch for the NEXT segment while compute runs.
-            bool prefetched_next = false;
-            if (!is_last) {
-                next_params = sd::ggml_graph_cut::runtime_param_tensors(gf,
-                                                                        plan.segments[seg_idx + 1],
-                                                                        get_desc().c_str());
-                if (!offload_pending_params(next_params)) {
-                    LOG_ERROR("%s prefetch: pending offload failed at segment %zu",
-                              get_desc().c_str(), seg_idx + 1);
-                    ggml_backend_synchronize(runtime_backend);
-                    ggml_free(segment_graph_ctx);
-                    return bail();
-                }
-                prefetched_next = !pending_offload_pairs.empty();
-            }
-
-            // Wait for compute (and the just-issued H2D) to finish.
-            ggml_backend_synchronize(runtime_backend);
-
-            // Read output for the LAST segment.
-            if (is_last && !no_return) {
-                auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
-                if (result != nullptr) {
-                    output = read_graph_tensor<T>(result, "output");
-                    if (!output.has_value()) {
-                        ggml_free(segment_graph_ctx);
-                        return bail();
-                    }
-                } else {
-                    output = sd::Tensor<T>();
-                }
-            }
-
-            // Cache outputs that future segments will consume.
-            if (!copy_cache_tensors_to_cache_buffer(&future_cut_names)) {
-                ggml_free(segment_graph_ctx);
-                return bail();
-            }
-
-            ggml_free(segment_graph_ctx);
-
-            // Free the just-used compute buffer so the next segment's
-            // allocator can allocate fresh. This also tears down compute_allocr.
-            if (compute_allocr != nullptr) {
-                ggml_gallocr_free(compute_allocr);
-                compute_allocr = nullptr;
-            }
-
-            // Restore current segment's partial params (frees its GPU buffer).
-            restore_partial_params();
-
-            // Promote prefetched next-segment params to active partial state.
-            if (prefetched_next) {
-                promote_pending_to_partial();
-            }
-
-            if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) {
-                layer_registry_.move_layer_to_cpu(segment.group_name);
-            }
-        }
-
-        backend_tensor_data_map.clear();
-        free_cache_ctx_and_buffer();
-        free_compute_ctx();
-        return output;
-    }
-
 public:
 
     virtual std::string get_desc() = 0;
@@ -3516,7 +3090,6 @@ struct GGMLRunner {
             ggml_gallocr_free(compute_allocr);
             compute_allocr = nullptr;
         }
-        discard_pending_params();
         restore_partial_params();
         restore_all_params();
     }

From 60d00cc35c479af52503212067696a8841f283de Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Tue, 2 Jun 2026 21:50:35 +0800
Subject: [PATCH 08/11] use clamped streaming budget for residency

---
 src/ggml_extend.hpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index c7f6c540e..987fe3ab3 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2429,7 +2429,8 @@ struct GGMLRunner {
     }
 
     bool resolve_graph_cut_plan(ggml_cgraph* gf,
-                                GraphCutPlan* plan_out) {
+                                GraphCutPlan* plan_out,
+                                size_t* effective_budget_out = nullptr) {
         GGML_ASSERT(plan_out != nullptr);
         GGML_ASSERT(gf != nullptr);
 
@@ -2455,6 +2456,10 @@ struct GGMLRunner {
             }
         }
 
+        if (effective_budget_out != nullptr) {
+            *effective_budget_out = effective_budget;
+        }
+
         *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                      gf,
                                                      &graph_cut_plan_cache_,
@@ -2814,6 +2819,7 @@ struct GGMLRunner {
     template <typename T>
     std::optional<sd::Tensor<T>> compute_streaming_segments(ggml_cgraph*        gf,
                                                             const GraphCutPlan& plan,
+                                                            size_t              residency_budget_bytes,
                                                             int                 n_threads,
                                                             bool                free_compute_buffer_immediately,
                                                             bool                no_return = false) {
@@ -2832,7 +2838,7 @@ struct GGMLRunner {
         } else {
             sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
             if (base_plan.available) {
-                sd::ggml_graph_cut::annotate_residency(base_plan, max_graph_vram_bytes);
+                sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
 
                 std::vector<ggml_tensor*> resident_params;
                 uint64_t token = 0;
@@ -3138,7 +3144,8 @@ struct GGMLRunner {
 
         if (can_attempt_graph_cut_segmented_compute()) {
             GraphCutPlan plan;
-            if (!resolve_graph_cut_plan(gf, &plan)) {
+            size_t effective_graph_vram_bytes = 0;
+            if (!resolve_graph_cut_plan(gf, &plan, &effective_graph_vram_bytes)) {
                 free_compute_ctx();
                 return std::nullopt;
             }
@@ -3146,6 +3153,7 @@ struct GGMLRunner {
                 if (stream_layers_enabled) {
                     return compute_streaming_segments<T>(gf,
                                                          plan,
+                                                         effective_graph_vram_bytes,
                                                          n_threads,
                                                          free_compute_buffer_immediately,
                                                          no_return);

From d097f86ef2b416a9421cc4812ffa0a6cc7f3d547 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Tue, 2 Jun 2026 21:52:47 +0800
Subject: [PATCH 09/11] use validated streaming flag for lora auto mode

---
 src/stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index b79bbcdc9..1bfba8a32 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -471,7 +471,7 @@ class StableDiffusionGGML {
             // Runtime mode is correct but slower because chunk-K residency
             // is skipped while a weight_adapter is attached (see
             // compute_streaming_segments).
-            const bool streaming_constrained = sd_ctx_params->stream_layers ||
+            const bool streaming_constrained = stream_layers ||
                                                sd_ctx_params->offload_params_to_cpu;
             if (have_quantized_weight || streaming_constrained) {
                 apply_lora_immediately = false;

From 30a4c8e1134a96f781fae70b1ec47acbd0436663 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Tue, 2 Jun 2026 22:02:21 +0800
Subject: [PATCH 10/11] trim streaming implementation comments

---
 src/ggml_extend.hpp      | 42 ++++++----------------------------------
 src/ggml_graph_cut.cpp   | 15 ++------------
 src/ggml_graph_cut.h     | 16 ++-------------
 src/stable-diffusion.cpp | 25 +++---------------------
 4 files changed, 13 insertions(+), 85 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 987fe3ab3..08a08036f 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1699,10 +1699,7 @@ struct GGMLRunner {
     ggml_backend_buffer_t partial_runtime_params_buffer = nullptr;
     std::vector<std::pair<ggml_tensor*, ggml_tensor*>> partial_offload_pairs;
 
-    // chunk-K residency: GPU-resident params shared across streaming
-    // segments. offload_partial_params() skips tensors in
-    // resident_param_set so the per-segment offload track does not tear
-    // down the resident buffer-swap.
+    // Params kept on the runtime backend across streaming segments.
     ggml_context* resident_offload_ctx = nullptr;
     std::vector<std::pair<ggml_tensor*, ggml_tensor*>> resident_offload_pairs;
     ggml_backend_buffer_t resident_runtime_params_buffer = nullptr;
@@ -2181,7 +2178,7 @@ struct GGMLRunner {
                 continue;
             }
             if (resident_param_set.find(tensor) != resident_param_set.end()) {
-                continue;  // already resident on GPU via the chunk-K track
+                continue;
             }
             if (seen_tensors.insert(tensor).second) {
                 unique_tensors.push_back(tensor);
@@ -2305,9 +2302,6 @@ struct GGMLRunner {
         }
     }
 
-    // chunk-K: load params onto the runtime backend and keep them resident
-    // across compute() calls. Caller must release any prior resident set
-    // via restore_resident_params() first.
     bool offload_resident_params(const std::vector<ggml_tensor*>& tensors) {
         if (params_backend == runtime_backend) {
             return true;
@@ -2434,10 +2428,7 @@ struct GGMLRunner {
         GGML_ASSERT(plan_out != nullptr);
         GGML_ASSERT(gf != nullptr);
 
-        // Start from the user-set budget. When streaming is on, clamp to actual
-        // free VRAM at this moment so we don't overcommit when another runner
-        // (e.g. the LLM in cond_stage) already grabbed a chunk-K resident set
-        // earlier in the generation.
+        // Keep the plan and resident params under the same live-VRAM cap.
         size_t effective_budget = max_graph_vram_bytes;
         if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
             ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
@@ -2808,10 +2799,6 @@ struct GGMLRunner {
     }
 
 public:
-    // Release chunk-K resident params held across compute() calls. Safe to
-    // call at any point — if no resident state exists this is a no-op.
-    // stable-diffusion.cpp uses this to evict the diffusion model's resident
-    // set before VAE decode so VAE has VRAM for its compute buffer.
     void release_streaming_residency() {
         restore_resident_params();
     }
@@ -2825,14 +2812,8 @@ struct GGMLRunner {
                                                             bool                no_return = false) {
         GGML_ASSERT(gf != nullptr);
 
-        // chunk-K residency: annotate the BASE plan (per-layer granularity) and
-        // offload its first-K segments' params once. Compute itself proceeds on
-        // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params
-        // Runtime weight_adapter (e.g. LoRA in at-runtime mode) mutates the
-        // CPU weight data between compute calls. The chunk-K resident GPU
-        // copy is snapshotted once at offload, so it would go stale; the
-        // state_token only hashes tensor pointers and cannot detect this.
-        // Release any resident set and skip rebuild in that case.
+        // Runtime LoRA mutates CPU weights between calls, so resident GPU
+        // copies would go stale.
         if (weight_adapter != nullptr) {
             restore_resident_params();
         } else {
@@ -2871,8 +2852,6 @@ struct GGMLRunner {
         free_compute_buffer();
         free_cache_ctx_and_buffer();
 
-        // Tensors not associated with any segment (the "_global" group holds
-        // head/tail params) move to GPU once for the full run.
         layer_registry_.move_layer_to_gpu("_global");
 
         std::unordered_map<ggml_tensor*, PersistentExternalBinding> persistent_externals;
@@ -2892,10 +2871,6 @@ struct GGMLRunner {
                       segment.group_name.c_str(),
                       segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED");
 
-            // Move this segment's layer group onto GPU (skipped for resident
-            // groups already loaded). Registry is currently unpopulated at
-            // runtime, so this is a harmless no-op until chunk-K residency
-            // lands in a later task.
             if (!layer_registry_.move_layer_to_gpu(segment.group_name)) {
                 LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)",
                           get_desc().c_str(),
@@ -2939,8 +2914,6 @@ struct GGMLRunner {
             }
             output = std::move(segment_output);
 
-            // Evict this segment's layer group when streaming. Resident groups
-            // stay loaded for the next sampling step.
             if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) {
                 layer_registry_.move_layer_to_cpu(segment.group_name);
             }
@@ -3040,10 +3013,7 @@ struct GGMLRunner {
     }
 
     void free_params_buffer() {
-        // If chunk-K residency is active, our swap pointers reference the
-        // params backend buffer. Tear it down BEFORE freeing the params
-        // buffer so the swapped-back tensors don't end up with dangling
-        // pointers to a freed buffer.
+        // Restore swapped resident params before freeing their backing buffer.
         restore_resident_params();
         if (params_buffer != nullptr) {
             ggml_backend_buffer_free(params_buffer);
diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index 77dba0b93..61234eaf2 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -754,11 +754,7 @@ namespace sd::ggml_graph_cut {
     }
 
     void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) {
-        // Reset any RESIDENT marks from a prior call before considering the
-        // new budget. The plan is cached across compute() invocations and
-        // the budget changes per-call (free-VRAM clamp, runner state, etc.);
-        // without this reset, segments marked RESIDENT under a larger budget
-        // would persist when the budget shrinks.
+        // Cached plans may be reused with a smaller live budget.
         for (auto& seg : plan.segments) {
             seg.residency = SegmentResidency::STREAMED;
         }
@@ -766,7 +762,6 @@ namespace sd::ggml_graph_cut {
             return;
         }
 
-        // Sanity: skip if no segment carries any params.
         bool any_param_bearing = false;
         for (const auto& seg : plan.segments) {
             if (seg.input_param_bytes > 0) {
@@ -778,10 +773,7 @@ namespace sd::ggml_graph_cut {
             return;
         }
 
-        // Model the active streamed segment's real footprint instead of a
-        // fixed reserve: params + compute buffer + output + boundary-cut
-        // inputs. The largest such footprint over the plan is the headroom
-        // the resident set must leave free.
+        // Leave room for the largest active streamed segment.
         size_t worst_streamed_footprint = 0;
         for (const auto& seg : plan.segments) {
             const size_t seg_footprint = seg.input_param_bytes +
@@ -801,9 +793,6 @@ namespace sd::ggml_graph_cut {
         }
         const size_t available = max_graph_vram_bytes - reserved;
 
-        // Greedily mark segments RESIDENT from the start until we exceed the
-        // available budget. Honors heterogeneous segment sizes (small prelude +
-        // larger transformer layers) better than dividing by a single estimate.
         size_t cumulative = 0;
         for (auto& seg : plan.segments) {
             if (cumulative + seg.input_param_bytes > available) {
diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h
index 7943f48d6..9668bc38e 100644
--- a/src/ggml_graph_cut.h
+++ b/src/ggml_graph_cut.h
@@ -12,11 +12,7 @@
 
 namespace sd::ggml_graph_cut {
 
-    // Whether a segment's params live on the GPU across the entire generation
-    // (RESIDENT) or get streamed in per sampling step then evicted (STREAMED).
-    // Only populated when the planner is invoked with stream_layers_enabled=true;
-    // otherwise every segment is implicitly STREAMED and the existing walker in
-    // GGMLRunner::compute() handles it (upstream behavior unchanged).
+    // Streaming residency for a segment's params.
     enum class SegmentResidency : uint8_t {
         STREAMED = 0,
         RESIDENT = 1,
@@ -114,15 +110,7 @@ namespace sd::ggml_graph_cut {
                       const std::unordered_set<const ggml_tensor*>& params_tensor_set,
                       const char* log_desc);
 
-    // Annotate the first K segments of plan as RESIDENT, where K is the
-    // number of leading segments that fit in the residency budget. K is
-    // determined by:
-    //   K = (max_graph_vram_bytes - prefetch_reserve - safety
-    //        - compute_buffer_reserve) / per_segment_param_bytes,
-    //   clamped to plan.segments.size().
-    //
-    // If max_graph_vram_bytes is 0 or the plan has fewer than 2 segments,
-    // this is a no-op (no annotation; behavior matches upstream).
+    // Mark leading segments resident when they fit after streamed-segment headroom.
     void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
 }  // namespace sd::ggml_graph_cut
 
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 1bfba8a32..03aa29005 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -233,11 +233,6 @@ class StableDiffusionGGML {
 
     bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
         std::string error;
-        // Use the member offload_params_to_cpu (not sd_ctx_params->...) so the
-        // auto-enable in init() (when --stream-layers is set without explicit
-        // params placement) actually reaches backend_manager. Otherwise the
-        // log says "enabling --offload-to-cpu implicitly" but params still
-        // get allocated on the runtime backend.
         if (!backend_manager.init(sd_ctx_params->backend,
                                   sd_ctx_params->params_backend,
                                   offload_params_to_cpu,
@@ -271,16 +266,11 @@ class StableDiffusionGGML {
         backend_spec            = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec     = SAFE_STR(sd_ctx_params->params_backend);
         if (stream_layers && max_vram == 0.f) {
-            // max_vram == 0 means "disabled"; negative values mean
-            // "auto-detect (free VRAM minus |max_vram| GiB)" and are
-            // resolved later in ggml_graph_cut::resolve_auto_max_vram_bytes.
             LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring");
             stream_layers = false;
         }
         if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) {
-            // Streaming needs weights on CPU to stream from. Mirror the
-            // --max-vram-missing branch above: warn and disable instead of
-            // silently flipping the caller's offload-to-cpu setting.
+            // Streaming needs CPU-resident params.
             LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring");
             stream_layers = false;
         }
@@ -462,15 +452,7 @@ class StableDiffusionGGML {
                     }
                 }
             }
-            // Immediate mode bakes LoRA into weights at load time by running
-            // a forward pass over EVERY weight tensor. That allocates a full-
-            // model-sized compute buffer (~11 GB for Z-Image bf16) on the
-            // runtime backend. On VRAM-constrained setups (--offload-to-cpu
-            // and/or --stream-layers, i.e. the whole reason streaming exists),
-            // that allocation OOMs — so AUTO must pick runtime there instead.
-            // Runtime mode is correct but slower because chunk-K residency
-            // is skipped while a weight_adapter is attached (see
-            // compute_streaming_segments).
+            // Avoid full-model LoRA merge buffers on constrained setups.
             const bool streaming_constrained = stream_layers ||
                                                sd_ctx_params->offload_params_to_cpu;
             if (have_quantized_weight || streaming_constrained) {
@@ -2398,8 +2380,7 @@ class StableDiffusionGGML {
         if (sd_version_is_pid(version)) {
             return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f);
         }
-        // chunk-K residency held across sampling steps starves VAE decode
-        // of VRAM for its compute buffer; release it before decode.
+        // Free resident diffusion params before VAE allocates its compute buffer.
         if (stream_layers) {
             if (diffusion_model) {
                 diffusion_model->release_streaming_residency();

From 084ccb52291c6f734d1f3c66b734899eabe032f0 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Tue, 2 Jun 2026 22:24:06 +0800
Subject: [PATCH 11/11] format code

---
 src/ggml_extend.hpp      |  59 +++++------
 src/layer_registry.cpp   | 211 ++++++++++++++++++++-------------------
 src/layer_registry.h     |  67 +++++++------
 src/stable-diffusion.cpp |   8 +-
 4 files changed, 177 insertions(+), 168 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 08a08036f..ef104368a 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1707,7 +1707,7 @@ struct GGMLRunner {
     uint64_t resident_state_token = 0;
 
     size_t max_graph_vram_bytes = 0;
-    bool stream_layers_enabled = false;
+    bool stream_layers_enabled  = false;
 
     sd::layer_registry::LayerRegistry layer_registry_;
 
@@ -2319,16 +2319,19 @@ struct GGMLRunner {
         unique_tensors.reserve(tensors.size());
         seen.reserve(tensors.size());
         for (ggml_tensor* t : tensors) {
-            if (t == nullptr) continue;
-            if (seen.insert(t).second) unique_tensors.push_back(t);
+            if (t == nullptr)
+                continue;
+            if (seen.insert(t).second)
+                unique_tensors.push_back(t);
         }
-        if (unique_tensors.empty()) return true;
+        if (unique_tensors.empty())
+            return true;
 
         ggml_init_params init = {};
-        init.mem_size   = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
-        init.mem_buffer = nullptr;
-        init.no_alloc   = true;
-        resident_offload_ctx = ggml_init(init);
+        init.mem_size         = std::max<size_t>(1, unique_tensors.size()) * ggml_tensor_overhead();
+        init.mem_buffer       = nullptr;
+        init.no_alloc         = true;
+        resident_offload_ctx  = ggml_init(init);
         GGML_ASSERT(resident_offload_ctx != nullptr);
 
         resident_offload_pairs.reserve(unique_tensors.size());
@@ -2351,12 +2354,12 @@ struct GGMLRunner {
         ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
         for (auto& pair : resident_offload_pairs) {
-            ggml_tensor* t = pair.first;
+            ggml_tensor* t    = pair.first;
             ggml_tensor* twin = pair.second;
             ggml_backend_tensor_copy(t, twin);
             std::swap(t->buffer, twin->buffer);
-            std::swap(t->data,   twin->data);
-            std::swap(t->extra,  twin->extra);
+            std::swap(t->data, twin->data);
+            std::swap(t->extra, twin->extra);
             resident_param_set.insert(t);
         }
         ggml_backend_synchronize(runtime_backend);
@@ -2385,14 +2388,14 @@ struct GGMLRunner {
             return;
         }
         for (auto& pair : resident_offload_pairs) {
-            ggml_tensor* t = pair.first;
+            ggml_tensor* t    = pair.first;
             ggml_tensor* twin = pair.second;
-            t->buffer = twin->buffer;
-            t->data   = twin->data;
-            t->extra  = twin->extra;
-            twin->buffer = nullptr;
-            twin->data   = nullptr;
-            twin->extra  = nullptr;
+            t->buffer         = twin->buffer;
+            t->data           = twin->data;
+            t->extra          = twin->extra;
+            twin->buffer      = nullptr;
+            twin->data        = nullptr;
+            twin->extra       = nullptr;
         }
         if (resident_runtime_params_buffer != nullptr) {
             ggml_backend_buffer_free(resident_runtime_params_buffer);
@@ -2436,7 +2439,7 @@ struct GGMLRunner {
                 size_t free_vram = 0, total_vram = 0;
                 ggml_backend_dev_memory(dev, &free_vram, &total_vram);
                 constexpr size_t safety_margin = 512ull * 1024 * 1024;
-                size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
+                size_t free_clamp              = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
                 if (free_clamp < effective_budget) {
                     LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB",
                              get_desc().c_str(),
@@ -2804,12 +2807,12 @@ struct GGMLRunner {
     }
 
     template <typename T>
-    std::optional<sd::Tensor<T>> compute_streaming_segments(ggml_cgraph*        gf,
+    std::optional<sd::Tensor<T>> compute_streaming_segments(ggml_cgraph* gf,
                                                             const GraphCutPlan& plan,
-                                                            size_t              residency_budget_bytes,
-                                                            int                 n_threads,
-                                                            bool                free_compute_buffer_immediately,
-                                                            bool                no_return = false) {
+                                                            size_t residency_budget_bytes,
+                                                            int n_threads,
+                                                            bool free_compute_buffer_immediately,
+                                                            bool no_return = false) {
         GGML_ASSERT(gf != nullptr);
 
         // Runtime LoRA mutates CPU weights between calls, so resident GPU
@@ -2829,7 +2832,8 @@ struct GGMLRunner {
                     }
                     auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
                     for (ggml_tensor* t : seg_params) {
-                        if (t == nullptr) continue;
+                        if (t == nullptr)
+                            continue;
                         resident_params.push_back(t);
                         token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
                     }
@@ -2861,7 +2865,7 @@ struct GGMLRunner {
         for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) {
             int64_t t_segment_begin = ggml_time_ms();
             const auto& segment     = plan.segments[seg_idx];
-            const bool  is_last     = seg_idx + 1 == plan.segments.size();
+            const bool is_last      = seg_idx + 1 == plan.segments.size();
             auto future_cut_names   = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx);
 
             LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)",
@@ -2897,7 +2901,7 @@ struct GGMLRunner {
             }
 
             ggml_context* segment_graph_ctx = nullptr;
-            ggml_cgraph*  segment_graph     = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
+            ggml_cgraph* segment_graph      = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx);
             auto segment_output             = execute_graph<T>(segment_graph,
                                                    n_threads,
                                                    /*free_compute_buffer_immediately=*/true,
@@ -2927,7 +2931,6 @@ struct GGMLRunner {
     }
 
 public:
-
     virtual std::string get_desc() = 0;
 
     GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend)
diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp
index 78ece2b91..c4ff18814 100644
--- a/src/layer_registry.cpp
+++ b/src/layer_registry.cpp
@@ -6,122 +6,127 @@
 
 namespace sd::layer_registry {
 
-void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
-    auto& info = layers_[name];
-    info.tensors.push_back(tensor);
-    info.bytes += ggml_nbytes(tensor);
-}
-
-bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
-    auto it = layers_.find(name);
-    if (it == layers_.end()) return false;
-
-    LayerInfo& info = it->second;
-    if (info.on_gpu) return true;
-    if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
-        LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
-                  name.c_str());
-        return false;
-    }
-    if (info.tensors.empty()) {
-        info.on_gpu = true;
-        return true;
+    void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
+        auto& info = layers_[name];
+        info.tensors.push_back(tensor);
+        info.bytes += ggml_nbytes(tensor);
     }
 
-    // 1. Build a no_alloc context big enough to hold one twin tensor per CPU
-    //    tensor, plus a little overhead.
-    const size_t     ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
-    ggml_init_params ctx_params{ ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true };
-    ggml_context*    twin_ctx = ggml_init(ctx_params);
-    if (twin_ctx == nullptr) {
-        LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
-                  name.c_str());
-        return false;
-    }
+    bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
+        auto it = layers_.find(name);
+        if (it == layers_.end())
+            return false;
+
+        LayerInfo& info = it->second;
+        if (info.on_gpu)
+            return true;
+        if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
+            LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
+                      name.c_str());
+            return false;
+        }
+        if (info.tensors.empty()) {
+            info.on_gpu = true;
+            return true;
+        }
 
-    // 2. Create one GPU twin per CPU tensor. The twin shares the original
-    //    name so any name-based lookup keeps working.
-    std::vector<ggml_tensor*> gpu_twins;
-    gpu_twins.reserve(info.tensors.size());
-    for (ggml_tensor* cpu_t : info.tensors) {
-        ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
-        if (cpu_t->name[0] != '\0') {
-            ggml_set_name(twin, cpu_t->name);
+        // 1. Build a no_alloc context big enough to hold one twin tensor per CPU
+        //    tensor, plus a little overhead.
+        const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
+        ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true};
+        ggml_context* twin_ctx = ggml_init(ctx_params);
+        if (twin_ctx == nullptr) {
+            LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
+                      name.c_str());
+            return false;
         }
-        gpu_twins.push_back(twin);
-    }
 
-    // 3. Back the twins with a GPU buffer in one alloc call.
-    ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
-    if (gpu_buffer == nullptr) {
-        LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
-                  name.c_str());
-        ggml_free(twin_ctx);
-        return false;
-    }
+        // 2. Create one GPU twin per CPU tensor. The twin shares the original
+        //    name so any name-based lookup keeps working.
+        std::vector<ggml_tensor*> gpu_twins;
+        gpu_twins.reserve(info.tensors.size());
+        for (ggml_tensor* cpu_t : info.tensors) {
+            ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
+            if (cpu_t->name[0] != '\0') {
+                ggml_set_name(twin, cpu_t->name);
+            }
+            gpu_twins.push_back(twin);
+        }
 
-    // 4. H2D copy + sync.
-    for (size_t i = 0; i < info.tensors.size(); ++i) {
-        ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
-    }
-    ggml_backend_synchronize(gpu_backend_);
+        // 3. Back the twins with a GPU buffer in one alloc call.
+        ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
+        if (gpu_buffer == nullptr) {
+            LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
+                      name.c_str());
+            ggml_free(twin_ctx);
+            return false;
+        }
 
-    // 5. Swap buffer/data/extra so the originals now point at GPU memory.
-    for (size_t i = 0; i < info.tensors.size(); ++i) {
-        std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
-        std::swap(info.tensors[i]->data,   gpu_twins[i]->data);
-        std::swap(info.tensors[i]->extra,  gpu_twins[i]->extra);
-    }
+        // 4. H2D copy + sync.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
+        }
+        ggml_backend_synchronize(gpu_backend_);
 
-    info.gpu_twins  = std::move(gpu_twins);
-    info.twin_ctx   = twin_ctx;
-    info.gpu_buffer = gpu_buffer;
-    info.on_gpu     = true;
-    return true;
-}
-
-bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
-    auto it = layers_.find(name);
-    if (it == layers_.end()) return false;
-
-    LayerInfo& info = it->second;
-    if (!info.on_gpu) return true;
-    if (info.tensors.size() != info.gpu_twins.size()) {
-        LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
-                  name.c_str());
-        return false;
+        // 5. Swap buffer/data/extra so the originals now point at GPU memory.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
+            std::swap(info.tensors[i]->data, gpu_twins[i]->data);
+            std::swap(info.tensors[i]->extra, gpu_twins[i]->extra);
+        }
+
+        info.gpu_twins  = std::move(gpu_twins);
+        info.twin_ctx   = twin_ctx;
+        info.gpu_buffer = gpu_buffer;
+        info.on_gpu     = true;
+        return true;
     }
 
-    // 1. Swap back: originals point at CPU memory again.
-    for (size_t i = 0; i < info.tensors.size(); ++i) {
-        if (info.gpu_twins[i] == nullptr) continue;
-        std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
-        std::swap(info.tensors[i]->data,   info.gpu_twins[i]->data);
-        std::swap(info.tensors[i]->extra,  info.gpu_twins[i]->extra);
+    bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
+        auto it = layers_.find(name);
+        if (it == layers_.end())
+            return false;
+
+        LayerInfo& info = it->second;
+        if (!info.on_gpu)
+            return true;
+        if (info.tensors.size() != info.gpu_twins.size()) {
+            LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
+                      name.c_str());
+            return false;
+        }
+
+        // 1. Swap back: originals point at CPU memory again.
+        for (size_t i = 0; i < info.tensors.size(); ++i) {
+            if (info.gpu_twins[i] == nullptr)
+                continue;
+            std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
+            std::swap(info.tensors[i]->data, info.gpu_twins[i]->data);
+            std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra);
+        }
+
+        // 2. Free the GPU buffer + twin context.
+        if (info.gpu_buffer != nullptr) {
+            ggml_backend_buffer_free(info.gpu_buffer);
+            info.gpu_buffer = nullptr;
+        }
+        if (info.twin_ctx != nullptr) {
+            ggml_free(info.twin_ctx);
+            info.twin_ctx = nullptr;
+        }
+        info.gpu_twins.clear();
+        info.on_gpu = false;
+        return true;
     }
 
-    // 2. Free the GPU buffer + twin context.
-    if (info.gpu_buffer != nullptr) {
-        ggml_backend_buffer_free(info.gpu_buffer);
-        info.gpu_buffer = nullptr;
+    bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
+        auto it = layers_.find(name);
+        return it != layers_.end() && it->second.on_gpu;
     }
-    if (info.twin_ctx != nullptr) {
-        ggml_free(info.twin_ctx);
-        info.twin_ctx = nullptr;
+
+    size_t LayerRegistry::get_layer_size(const std::string& name) const {
+        auto it = layers_.find(name);
+        return it != layers_.end() ? it->second.bytes : 0;
     }
-    info.gpu_twins.clear();
-    info.on_gpu = false;
-    return true;
-}
-
-bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
-    auto it = layers_.find(name);
-    return it != layers_.end() && it->second.on_gpu;
-}
-
-size_t LayerRegistry::get_layer_size(const std::string& name) const {
-    auto it = layers_.find(name);
-    return it != layers_.end() ? it->second.bytes : 0;
-}
 
 }  // namespace sd::layer_registry
diff --git a/src/layer_registry.h b/src/layer_registry.h
index 6727130e4..2dc755349 100644
--- a/src/layer_registry.h
+++ b/src/layer_registry.h
@@ -6,43 +6,44 @@
 #include <string>
 #include <vector>
 
-#include "ggml.h"
 #include "ggml-backend.h"
+#include "ggml.h"
 
 namespace sd::layer_registry {
 
-struct LayerInfo {
-    std::vector<ggml_tensor*> tensors;
-    std::vector<ggml_tensor*> gpu_twins;
-    ggml_context*             twin_ctx   = nullptr;
-    ggml_backend_buffer_t     gpu_buffer = nullptr;
-    bool                      on_gpu     = false;
-    size_t                    bytes      = 0;
-};
-
-class LayerRegistry {
-public:
-    LayerRegistry() = default;
-    LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
-        : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
-
-    void   set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
-        gpu_backend_ = gpu_backend; cpu_backend_ = cpu_backend;
-    }
-    void   register_layer(const std::string& name, ggml_tensor* tensor);
-    bool   move_layer_to_gpu(const std::string& name);
-    bool   move_layer_to_cpu(const std::string& name);
-    bool   is_layer_on_gpu(const std::string& name) const;
-    size_t get_layer_size(const std::string& name) const;
-    size_t get_layer_count() const { return layers_.size(); }
-
-    const std::map<std::string, LayerInfo>& layers() const { return layers_; }
-
-private:
-    ggml_backend_t                   gpu_backend_ = nullptr;
-    ggml_backend_t                   cpu_backend_ = nullptr;
-    std::map<std::string, LayerInfo> layers_;
-};
+    struct LayerInfo {
+        std::vector<ggml_tensor*> tensors;
+        std::vector<ggml_tensor*> gpu_twins;
+        ggml_context* twin_ctx           = nullptr;
+        ggml_backend_buffer_t gpu_buffer = nullptr;
+        bool on_gpu                      = false;
+        size_t bytes                     = 0;
+    };
+
+    class LayerRegistry {
+    public:
+        LayerRegistry() = default;
+        LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
+            : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
+
+        void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
+            gpu_backend_ = gpu_backend;
+            cpu_backend_ = cpu_backend;
+        }
+        void register_layer(const std::string& name, ggml_tensor* tensor);
+        bool move_layer_to_gpu(const std::string& name);
+        bool move_layer_to_cpu(const std::string& name);
+        bool is_layer_on_gpu(const std::string& name) const;
+        size_t get_layer_size(const std::string& name) const;
+        size_t get_layer_count() const { return layers_.size(); }
+
+        const std::map<std::string, LayerInfo>& layers() const { return layers_; }
+
+    private:
+        ggml_backend_t gpu_backend_ = nullptr;
+        ggml_backend_t cpu_backend_ = nullptr;
+        std::map<std::string, LayerInfo> layers_;
+    };
 
 }  // namespace sd::layer_registry
 
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 03aa29005..9434fa5ed 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -4192,7 +4192,7 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
             std::vector<sd::Tensor<float>> empty_ref_images;
             condition_params.ref_images = &empty_ref_images;
             uncond                      = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
-                                                                         condition_params);
+                                                                                              condition_params);
             if (uncond.c_concat.empty()) {
                 uncond.c_concat = latents->uncond_concat_latent;  // TODO: optimize
             }
@@ -4210,9 +4210,9 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
 
     ImageGenerationEmbeds embeds;
     embeds.img_cond = std::move(img_cond);
-    embeds.cond    = std::move(cond);
-    embeds.uncond  = std::move(uncond);
-    embeds.id_cond = std::move(id_cond);
+    embeds.cond     = std::move(cond);
+    embeds.uncond   = std::move(uncond);
+    embeds.id_cond  = std::move(id_cond);
 
     return embeds;
 }