From 6317804781ef87aa3bac1ddbae3488e4fff57431 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Fri, 29 May 2026 02:13:30 +0200 Subject: [PATCH 01/11] Foundation: add --stream-layers flag, planner annotation, layer registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface area for the unified-streaming design. No behaviour change when --stream-layers is unset — the dispatch in compute() (added in the follow-up commit on src/ggml_extend.hpp) short-circuits to the upstream walker. - `sd_ctx_params_t::stream_layers` field (include/stable-diffusion.h). - `--stream-layers` boolean CLI flag (examples/common/common.{h,cpp}). - `sd::ggml_graph_cut::SegmentResidency` enum + Segment::residency field + annotate_residency declaration (src/ggml_graph_cut.h). - `sd::layer_registry::Registry` with register/move primitives using the proven dup-copy-swap idiom on tensor->buffer/data/extra (new src/layer_registry.{h,cpp}). - Conditioner subclasses gain a virtual set_stream_layers_enabled (src/conditioner.hpp). UpscalerGGML forwards to its inner runner (src/upscaler.{h,cpp}). --- examples/common/common.cpp | 6 ++ examples/common/common.h | 1 + include/stable-diffusion.h | 1 + src/conditioner.hpp | 43 +++++++++++++ src/ggml_graph_cut.h | 23 +++++++ src/layer_registry.cpp | 127 +++++++++++++++++++++++++++++++++++++ src/layer_registry.h | 49 ++++++++++++++ src/upscaler.cpp | 8 +++ src/upscaler.h | 2 + 9 files changed, 260 insertions(+) create mode 100644 src/layer_registry.cpp create mode 100644 src/layer_registry.h diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 519e8aae6..7878d4e36 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -418,6 +418,10 @@ ArgOptions SDContextParams::get_options() { }; options.bool_options = { + {"", + "--stream-layers", + "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)", + true, &stream_layers}, {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", @@ -694,6 +698,7 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" + << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n" << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" @@ -773,6 +778,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_t5_mask_pad, qwen_image_zero_cond_t, max_vram, + stream_layers, backend.c_str(), params_backend.c_str(), }; diff --git a/examples/common/common.h b/examples/common/common.h index ca367f7ee..35475fa8c 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -112,6 +112,7 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; float max_vram = 0.f; + bool stream_layers = false; std::string backend; std::string params_backend; bool enable_mmap = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index f8b2c2f59..a7555020d 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -213,6 +213,7 @@ typedef struct { int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) + bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; } sd_ctx_params_t; diff --git a/src/conditioner.hpp b/src/conditioner.hpp index f08feeef7..ba4af189e 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -118,6 +118,7 @@ struct Conditioner { virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} + virtual void set_stream_layers_enabled(bool enabled) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, @@ -205,6 +206,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + text_model->set_stream_layers_enabled(enabled); + if (sd_version_is_sdxl(version)) { + text_model2->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { text_model->set_flash_attention_enabled(enabled); if (sd_version_is_sdxl(version)) { @@ -831,6 +839,18 @@ struct SD3CLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (clip_l) { + clip_l->set_stream_layers_enabled(enabled); + } + if (clip_g) { + clip_g->set_stream_layers_enabled(enabled); + } + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { if (clip_l) { clip_l->set_flash_attention_enabled(enabled); @@ -1183,6 +1203,15 @@ struct FluxCLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (clip_l) { + clip_l->set_stream_layers_enabled(enabled); + } + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { if (clip_l) { clip_l->set_flash_attention_enabled(enabled); @@ -1414,6 +1443,12 @@ struct T5CLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { if (t5) { t5->set_flash_attention_enabled(enabled); @@ -1594,6 +1629,10 @@ struct AnimaConditioner : public Conditioner { llm->set_max_graph_vram_bytes(max_vram_bytes); } + void set_stream_layers_enabled(bool enabled) override { + llm->set_stream_layers_enabled(enabled); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -1735,6 +1774,10 @@ struct LLMEmbedder : public Conditioner { llm->set_max_graph_vram_bytes(max_vram_bytes); } + void set_stream_layers_enabled(bool enabled) override { + llm->set_stream_layers_enabled(enabled); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h index dcd1551a7..7943f48d6 100644 --- a/src/ggml_graph_cut.h +++ b/src/ggml_graph_cut.h @@ -2,6 +2,7 @@ #define __SD_GGML_GRAPH_CUT_H__ #include +#include #include #include #include @@ -11,6 +12,16 @@ namespace sd::ggml_graph_cut { + // Whether a segment's params live on the GPU across the entire generation + // (RESIDENT) or get streamed in per sampling step then evicted (STREAMED). + // Only populated when the planner is invoked with stream_layers_enabled=true; + // otherwise every segment is implicitly STREAMED and the existing walker in + // GGMLRunner::compute() handles it (upstream behavior unchanged). + enum class SegmentResidency : uint8_t { + STREAMED = 0, + RESIDENT = 1, + }; + struct Segment { enum InputType { INPUT_EXTERNAL = 0, @@ -34,6 +45,7 @@ namespace sd::ggml_graph_cut { std::vector internal_node_indices; std::vector output_node_indices; std::vector input_refs; + SegmentResidency residency = SegmentResidency::STREAMED; }; struct Plan { @@ -101,6 +113,17 @@ namespace sd::ggml_graph_cut { size_t max_graph_vram_bytes, const std::unordered_set& params_tensor_set, const char* log_desc); + + // Annotate the first K segments of plan as RESIDENT, where K is the + // number of leading segments that fit in the residency budget. K is + // determined by: + // K = (max_graph_vram_bytes - prefetch_reserve - safety + // - compute_buffer_reserve) / per_segment_param_bytes, + // clamped to plan.segments.size(). + // + // If max_graph_vram_bytes is 0 or the plan has fewer than 2 segments, + // this is a no-op (no annotation; behavior matches upstream). + void annotate_residency(Plan& plan, size_t max_graph_vram_bytes); } // namespace sd::ggml_graph_cut #endif diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp new file mode 100644 index 000000000..9b809d7b2 --- /dev/null +++ b/src/layer_registry.cpp @@ -0,0 +1,127 @@ +#include "layer_registry.h" + +#include + +#include "util.h" + +namespace sd::layer_registry { + +void Registry::register_layer(const std::string& name, ggml_tensor* tensor) { + auto& info = layers_[name]; + info.tensors.push_back(tensor); + info.bytes += ggml_nbytes(tensor); +} + +bool Registry::move_layer_to_gpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) return false; + + LayerInfo& info = it->second; + if (info.on_gpu) return true; + if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) { + LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU", + name.c_str()); + return false; + } + if (info.tensors.empty()) { + info.on_gpu = true; + return true; + } + + // 1. Build a no_alloc context big enough to hold one twin tensor per CPU + // tensor, plus a little overhead. + const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024; + ggml_init_params ctx_params{ ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true }; + ggml_context* twin_ctx = ggml_init(ctx_params); + if (twin_ctx == nullptr) { + LOG_ERROR("layer_registry: failed to allocate twin context for '%s'", + name.c_str()); + return false; + } + + // 2. Create one GPU twin per CPU tensor. The twin shares the original + // name so any name-based lookup keeps working. + std::vector gpu_twins; + gpu_twins.reserve(info.tensors.size()); + for (ggml_tensor* cpu_t : info.tensors) { + ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t); + if (cpu_t->name[0] != '\0') { + ggml_set_name(twin, cpu_t->name); + } + gpu_twins.push_back(twin); + } + + // 3. Back the twins with a GPU buffer in one alloc call. + ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_); + if (gpu_buffer == nullptr) { + LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'", + name.c_str()); + ggml_free(twin_ctx); + return false; + } + + // 4. H2D copy + sync. + for (size_t i = 0; i < info.tensors.size(); ++i) { + ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]); + } + ggml_backend_synchronize(gpu_backend_); + + // 5. Swap buffer/data/extra so the originals now point at GPU memory. + for (size_t i = 0; i < info.tensors.size(); ++i) { + std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, gpu_twins[i]->extra); + } + + info.gpu_twins = std::move(gpu_twins); + info.twin_ctx = twin_ctx; + info.gpu_buffer = gpu_buffer; + info.on_gpu = true; + return true; +} + +bool Registry::move_layer_to_cpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) return false; + + LayerInfo& info = it->second; + if (!info.on_gpu) return true; + if (info.tensors.size() != info.gpu_twins.size()) { + LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'", + name.c_str()); + return false; + } + + // 1. Swap back: originals point at CPU memory again. + for (size_t i = 0; i < info.tensors.size(); ++i) { + if (info.gpu_twins[i] == nullptr) continue; + std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, info.gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra); + } + + // 2. Free the GPU buffer + twin context. + if (info.gpu_buffer != nullptr) { + ggml_backend_buffer_free(info.gpu_buffer); + info.gpu_buffer = nullptr; + } + if (info.twin_ctx != nullptr) { + ggml_free(info.twin_ctx); + info.twin_ctx = nullptr; + } + info.gpu_twins.clear(); + info.on_gpu = false; + return true; +} + +bool Registry::is_layer_on_gpu(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() && it->second.on_gpu; +} + +size_t Registry::get_layer_size(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() ? it->second.bytes : 0; +} + +} // namespace sd::layer_registry diff --git a/src/layer_registry.h b/src/layer_registry.h new file mode 100644 index 000000000..d8b572db3 --- /dev/null +++ b/src/layer_registry.h @@ -0,0 +1,49 @@ +#ifndef __LAYER_REGISTRY_H__ +#define __LAYER_REGISTRY_H__ + +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-backend.h" + +namespace sd::layer_registry { + +struct LayerInfo { + std::vector tensors; + std::vector gpu_twins; + ggml_context* twin_ctx = nullptr; + ggml_backend_buffer_t gpu_buffer = nullptr; + bool on_gpu = false; + size_t bytes = 0; +}; + +class Registry { +public: + Registry() = default; + Registry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) + : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} + + void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { + gpu_backend_ = gpu_backend; cpu_backend_ = cpu_backend; + } + void register_layer(const std::string& name, ggml_tensor* tensor); + bool move_layer_to_gpu(const std::string& name); + bool move_layer_to_cpu(const std::string& name); + bool is_layer_on_gpu(const std::string& name) const; + size_t get_layer_size(const std::string& name) const; + size_t get_layer_count() const { return layers_.size(); } + + const std::map& layers() const { return layers_; } + +private: + ggml_backend_t gpu_backend_ = nullptr; + ggml_backend_t cpu_backend_ = nullptr; + std::map layers_; +}; + +} // namespace sd::layer_registry + +#endif diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 1197ce35e..cef26e36d 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -25,6 +25,13 @@ void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) { } } +void UpscalerGGML::set_stream_layers_enabled(bool enabled) { + stream_layers_enabled = enabled; + if (esrgan_upscaler) { + esrgan_upscaler->set_stream_layers_enabled(enabled); + } +} + bool UpscalerGGML::load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, int n_threads) { @@ -76,6 +83,7 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, tile_size, model_loader.get_tensor_storage_map()); esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); + esrgan_upscaler->set_stream_layers_enabled(stream_layers_enabled); if (direct) { esrgan_upscaler->set_conv2d_direct_enabled(true); } diff --git a/src/upscaler.h b/src/upscaler.h index e3967865c..e2197694d 100644 --- a/src/upscaler.h +++ b/src/upscaler.h @@ -18,6 +18,7 @@ struct UpscalerGGML { bool direct = false; int tile_size = 128; size_t max_graph_vram_bytes = 0; + bool stream_layers_enabled = false; std::string backend_spec; std::string params_backend_spec; @@ -31,6 +32,7 @@ struct UpscalerGGML { bool offload_params_to_cpu, int n_threads); void set_max_graph_vram_bytes(size_t max_vram_bytes); + void set_stream_layers_enabled(bool enabled); sd::Tensor upscale_tensor(const sd::Tensor& input_tensor); sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor); }; From 010e7a619822f41c0d3366585b62f9b0b36127a0 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Fri, 29 May 2026 02:13:30 +0200 Subject: [PATCH 02/11] chunk-K residency, multi-runner safety, and LoRA correctness fallbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on the foundation commit (--stream-layers + planner annotation + executor scaffolding) with: ## chunk-K residency A parallel `resident_*` offload track on `GGMLRunner` keeps a fraction of the diffusion model's params on GPU permanently across sampling steps, amortising H2D over many invocations. - Members: `resident_offload_ctx`, `resident_offload_pairs`, `resident_runtime_params_buffer`, `resident_param_set`, `resident_state_token` (parallel to the existing `partial_offload_*` per-segment track). - `offload_resident_params(tensors)` / `restore_resident_params()` use the same dup-copy-swap idiom as `offload_partial_params` but write to the resident slot and persist across `compute()` calls. - `offload_partial_params` filters tensors already in `resident_param_set` so per-segment offload skips them. `restore_resident_params` is hooked into `~GGMLRunner()` and `free_params_buffer()` to keep swap pointers valid through teardown. - `compute_streaming_segments` reads `graph_cut_plan_cache_.graph_cut_plan` (the unmerged base plan), annotates it, gathers the union of RESIDENT segments' param tensors, and offloads them once. Compute itself proceeds on the merged plan for fused-graph efficiency. A commutative pointer-hash state-token detects when a different plan is in play and rebuilds the resident set. `annotate_residency` updates: - "Any param-bearing segment exists" sanity replaces the `segments[0].input_param_bytes == 0` early-return (wrong for diffusion models whose first segment is a small prelude). - Greedy cumulative-bytes loop handles heterogeneous segment sizes (small prelude + large transformer layers). - Resets `seg.residency = STREAMED` at entry so cached plans don't carry forward stale RESIDENT marks from a previous larger-budget call. - Don't reserve a `prefetch_segments * largest_segment` window; async prefetch is no longer used (see below). ## Multi-runner safety - Per-runner free-VRAM clamp at compute time in `resolve_graph_cut_plan`. Each runner queries `ggml_backend_dev_memory(runtime_backend)` and clamps `effective_budget = min(max_vram, free - 512 MB)` per call. Without this, after the LLM committed ~7 GB chunk-K resident the diffusion runner still believed it had the whole budget and OOM'd. - `--stream-layers` is restricted to diffusion runners only (diffusion_model + high_noise_diffusion_model) — matches PR #1477's scope and avoids one-shot runners (LLM, VAE, clip_vision, upscaler) claiming permanent chunk-K state that starves the diffusion model. - `GGMLRunner::release_streaming_residency()` (public trampoline to `restore_resident_params`) is called from `decode_first_stage()` on diffusion_model + high_noise_diffusion_model right before VAE decode. Without it the 6.5 GB chunk-K residency from sampling would starve VAE's compute buffer (~4.5 GB at full image resolution) and OOM at decode. ## `LORA_APPLY_AUTO` picks runtime when streaming or CPU-offload is on Immediate mode bakes LoRA into weights at load time by running a forward pass over every weight tensor — allocates a full-model-size (~11 GB on Z-Image bf16) compute buffer on the runtime backend in one shot and OOMs on any VRAM-constrained setup, which is the whole reason `--stream-layers` / `--offload-to-cpu` exist. AUTO previously only picked runtime for quantized models; now `stream_layers || offload_params_to_cpu` is also a trigger. ## Conservative streaming v4 (current shipping configuration) After observing edge-case failures with prefetch + multi-LoRA / + non-default --guidance/--flow-shift / + batch_count > 1, the final configuration is: - chunk-K residency skips itself when `weight_adapter != nullptr`. The state-token hashes tensor *pointers*, not data, so it can't detect MultiLoraAdapter modifications across batch images / steps; the symptom was colored static noise on batch image 2+. - Async prefetch is hard-disabled (`prefetch_enabled = false`). The `compute_streaming_segments_prefetch` implementation stays in the file for future reference but is currently dead. Two correctness problems forced the disable: * Multi-LoRA workloads: graph_compute_async + per-segment pending offload races MultiLoraAdapter's per-layer patch_weight reads. * batch_count > 1 + non-default --guidance + --flow-shift: the smaller merged segments required to fit two prefetched buffers in --max-vram accumulate FP error across the extra boundary- cache roundtrips → collapses to pure white frames. - `resolve_graph_cut_plan` always passes the full `effective_budget` to the planner (no `/4` shrinking). Produces the upstream walker's large merged segments — the validated configuration. - The chunk-K hook reserves room for the LARGEST merged segment's params: `chunk_k_budget = max_graph_vram_bytes - largest_merged_segment`. Without this reservation chunk-K could grow large enough that the active merged-segment offload OOMs. ## Verification - Z-Image Q8 with `--stream-layers off`: byte-identical to upstream walker. - Z-Image bf16 1024x688 + 2 LoRAs + `--cfg-scale 1.0 --guidance 3.5 --flow-shift 3.0` + batch_count=2 (the production REST API failure mode that motivated the correctness work): both batch images clean and distinct. - Smoke-matrix-verified across Z-Image Q8/bf16, HiDream, Qwen, Flux schnell, SD3.5, SDXL, Anima, WAN. --- src/ggml_extend.hpp | 743 ++++++++++++++++++++++++++++++++++++++- src/ggml_graph_cut.cpp | 54 +++ src/stable-diffusion.cpp | 58 ++- 3 files changed, 852 insertions(+), 3 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 9178a31e8..093220a85 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -28,6 +28,7 @@ #include "ggml.h" #include "ggml_extend_backend.h" #include "ggml_graph_cut.h" +#include "layer_registry.h" #include "model.h" #include "tensor.hpp" @@ -1725,7 +1726,31 @@ struct GGMLRunner { ggml_context* partial_offload_ctx = nullptr; ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; std::vector> partial_offload_pairs; + + // Async-prefetch staging: a parallel partial-offload track used by the + // streaming executor to issue the NEXT segment's H2D copies while the + // CURRENT segment is computing. After the current segment's compute is + // done and its partial state is torn down, the pending_* members are + // promoted into partial_* via promote_pending_to_partial(). + ggml_context* pending_offload_ctx = nullptr; + ggml_backend_buffer_t pending_runtime_params_buffer = nullptr; + std::vector> pending_offload_pairs; + + // chunk-K residency track: holds params that stay GPU-resident across + // multiple compute_streaming_segments() invocations within a generation. + // Coexists with the partial_* per-segment track; offload_partial_params() + // filters out anything already in resident_param_set so its + // dup-copy-swap dance doesn't tear down the resident buffer-swap. + ggml_context* resident_offload_ctx = nullptr; + std::vector> resident_offload_pairs; + ggml_backend_buffer_t resident_runtime_params_buffer = nullptr; + std::unordered_set resident_param_set; + uint64_t resident_state_token = 0; + size_t max_graph_vram_bytes = 0; + bool stream_layers_enabled = false; + + sd::layer_registry::Registry layer_registry_; std::shared_ptr weight_adapter = nullptr; @@ -1803,6 +1828,10 @@ struct GGMLRunner { ggml_free(partial_offload_ctx); partial_offload_ctx = nullptr; } + if (pending_offload_ctx != nullptr) { + ggml_free(pending_offload_ctx); + pending_offload_ctx = nullptr; + } } void alloc_cache_ctx() { @@ -2193,6 +2222,9 @@ struct GGMLRunner { if (tensor == nullptr) { continue; } + if (resident_param_set.find(tensor) != resident_param_set.end()) { + continue; // already resident on GPU via the chunk-K track + } if (seen_tensors.insert(tensor).second) { unique_tensors.push_back(tensor); } @@ -2315,6 +2347,262 @@ struct GGMLRunner { } } + // Issue an H2D copy for the next segment's params WITHOUT performing + // a buffer-swap on the source tensors yet. The dup-tensors hold the GPU + // data ready to swap in. Unlike offload_partial_params, this does NOT + // call restore_partial_params() — the current partial state must remain + // intact so the in-flight compute is unaffected. + // + // The H2D goes through ggml_backend_tensor_copy() which for CUDA hits + // cudaStreamPerThread; compute runs on cuda_ctx->stream() (a separate + // non-blocking stream), so the two should overlap on-device. + bool offload_pending_params(const std::vector& tensors) { + if (params_backend == runtime_backend) { + return true; + } + if (tensors.empty()) { + return true; + } + GGML_ASSERT(pending_offload_ctx == nullptr); + GGML_ASSERT(pending_runtime_params_buffer == nullptr); + GGML_ASSERT(pending_offload_pairs.empty()); + + std::vector unique_tensors; + std::unordered_set seen_tensors; + unique_tensors.reserve(tensors.size()); + seen_tensors.reserve(tensors.size()); + for (ggml_tensor* tensor : tensors) { + if (tensor == nullptr) { + continue; + } + if (resident_param_set.find(tensor) != resident_param_set.end()) { + continue; + } + if (seen_tensors.insert(tensor).second) { + unique_tensors.push_back(tensor); + } + } + if (unique_tensors.empty()) { + return true; + } + + ggml_init_params params; + params.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); + params.mem_buffer = nullptr; + params.no_alloc = true; + + pending_offload_ctx = ggml_init(params); + GGML_ASSERT(pending_offload_ctx != nullptr); + + pending_offload_pairs.clear(); + pending_offload_pairs.reserve(unique_tensors.size()); + + for (ggml_tensor* tensor : unique_tensors) { + GGML_ASSERT(tensor->view_src == nullptr); + ggml_tensor* dup = ggml_dup_tensor(pending_offload_ctx, tensor); + ggml_set_name(dup, tensor->name); + pending_offload_pairs.push_back({tensor, dup}); + } + + pending_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(pending_offload_ctx, runtime_backend); + if (pending_runtime_params_buffer == nullptr) { + LOG_ERROR("%s alloc pending runtime params backend buffer failed, num_tensors = %zu", + get_desc().c_str(), + pending_offload_pairs.size()); + ggml_free(pending_offload_ctx); + pending_offload_ctx = nullptr; + pending_offload_pairs.clear(); + return false; + } + ggml_backend_buffer_set_usage(pending_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + // Issue the H2D copies. The src tensors are still pointing at host + // (or params backend) data; the dup tensors are on runtime_backend. + // ggml_backend_tensor_copy(host_src, gpu_dst) calls + // ggml_backend_tensor_set(dst, ...) which for CUDA enqueues on + // cudaStreamPerThread — independent of the compute stream. + for (auto& pair : pending_offload_pairs) { + ggml_tensor* tensor = pair.first; + ggml_tensor* offload_tensor = pair.second; + ggml_backend_tensor_copy(tensor, offload_tensor); + } + + return true; + } + + // Promote a pending (prefetched) offload track into the active partial + // track. Caller MUST have already torn down the previous partial track + // via restore_partial_params(). After this call, partial_* owns the + // pending dup-tensors and the swap into the active source tensors is + // performed here. + void promote_pending_to_partial() { + if (pending_offload_pairs.empty()) { + if (pending_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(pending_runtime_params_buffer); + pending_runtime_params_buffer = nullptr; + } + if (pending_offload_ctx != nullptr) { + ggml_free(pending_offload_ctx); + pending_offload_ctx = nullptr; + } + return; + } + GGML_ASSERT(partial_offload_pairs.empty()); + GGML_ASSERT(partial_offload_ctx == nullptr); + GGML_ASSERT(partial_runtime_params_buffer == nullptr); + + // Swap source tensors to point at the prefetched GPU storage. + for (auto& pair : pending_offload_pairs) { + ggml_tensor* tensor = pair.first; + ggml_tensor* offload_tensor = pair.second; + std::swap(tensor->buffer, offload_tensor->buffer); + std::swap(tensor->data, offload_tensor->data); + std::swap(tensor->extra, offload_tensor->extra); + } + partial_offload_ctx = pending_offload_ctx; + partial_runtime_params_buffer = pending_runtime_params_buffer; + partial_offload_pairs = std::move(pending_offload_pairs); + pending_offload_ctx = nullptr; + pending_runtime_params_buffer = nullptr; + pending_offload_pairs.clear(); + } + + // Discard any in-flight pending offload track without promoting it. + // Used on error paths after offload_pending_params() succeeded but the + // subsequent compute failed. + void discard_pending_params() { + if (pending_offload_pairs.empty()) { + if (pending_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(pending_runtime_params_buffer); + pending_runtime_params_buffer = nullptr; + } + if (pending_offload_ctx != nullptr) { + ggml_free(pending_offload_ctx); + pending_offload_ctx = nullptr; + } + return; + } + if (pending_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(pending_runtime_params_buffer); + pending_runtime_params_buffer = nullptr; + } + pending_offload_pairs.clear(); + if (pending_offload_ctx != nullptr) { + ggml_free(pending_offload_ctx); + pending_offload_ctx = nullptr; + } + } + + // chunk-K: load a set of params onto the runtime backend and keep them + // resident across compute() calls. Caller MUST have torn down any prior + // resident state via restore_resident_params() before calling. By design + // this does NOT call restore_resident_params() itself — the resident + // track persists across compute() invocations. + bool offload_resident_params(const std::vector& tensors) { + if (params_backend == runtime_backend) { + return true; + } + if (tensors.empty()) { + return true; + } + GGML_ASSERT(resident_runtime_params_buffer == nullptr); + GGML_ASSERT(resident_offload_ctx == nullptr); + GGML_ASSERT(resident_offload_pairs.empty()); + GGML_ASSERT(resident_param_set.empty()); + + std::vector unique_tensors; + std::unordered_set seen; + unique_tensors.reserve(tensors.size()); + seen.reserve(tensors.size()); + for (ggml_tensor* t : tensors) { + if (t == nullptr) continue; + if (seen.insert(t).second) unique_tensors.push_back(t); + } + if (unique_tensors.empty()) return true; + + ggml_init_params init = {}; + init.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); + init.mem_buffer = nullptr; + init.no_alloc = true; + resident_offload_ctx = ggml_init(init); + GGML_ASSERT(resident_offload_ctx != nullptr); + + resident_offload_pairs.reserve(unique_tensors.size()); + for (ggml_tensor* t : unique_tensors) { + GGML_ASSERT(t->view_src == nullptr); + ggml_tensor* twin = ggml_dup_tensor(resident_offload_ctx, t); + ggml_set_name(twin, t->name); + resident_offload_pairs.push_back({t, twin}); + } + + resident_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(resident_offload_ctx, runtime_backend); + if (resident_runtime_params_buffer == nullptr) { + LOG_ERROR("%s alloc resident runtime params backend buffer failed, num_tensors = %zu", + get_desc().c_str(), resident_offload_pairs.size()); + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + resident_offload_pairs.clear(); + return false; + } + ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + for (auto& pair : resident_offload_pairs) { + ggml_tensor* t = pair.first; + ggml_tensor* twin = pair.second; + ggml_backend_tensor_copy(t, twin); + std::swap(t->buffer, twin->buffer); + std::swap(t->data, twin->data); + std::swap(t->extra, twin->extra); + resident_param_set.insert(t); + } + ggml_backend_synchronize(runtime_backend); + + size_t sz = ggml_backend_buffer_get_size(resident_runtime_params_buffer); + LOG_INFO("%s offload resident params (%6.2f MB, %zu tensors) to runtime backend (%s)", + get_desc().c_str(), + sz / (1024.f * 1024.f), + resident_offload_pairs.size(), + ggml_backend_name(runtime_backend)); + return true; + } + + void restore_resident_params() { + if (resident_offload_pairs.empty()) { + if (resident_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(resident_runtime_params_buffer); + resident_runtime_params_buffer = nullptr; + } + if (resident_offload_ctx != nullptr) { + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + } + resident_param_set.clear(); + resident_state_token = 0; + return; + } + for (auto& pair : resident_offload_pairs) { + ggml_tensor* t = pair.first; + ggml_tensor* twin = pair.second; + t->buffer = twin->buffer; + t->data = twin->data; + t->extra = twin->extra; + twin->buffer = nullptr; + twin->data = nullptr; + twin->extra = nullptr; + } + if (resident_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(resident_runtime_params_buffer); + resident_runtime_params_buffer = nullptr; + } + resident_offload_pairs.clear(); + if (resident_offload_ctx != nullptr) { + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + } + resident_param_set.clear(); + resident_state_token = 0; + } + bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) { return plan.has_cuts && plan.valid && @@ -2334,12 +2622,85 @@ struct GGMLRunner { GraphCutPlan* plan_out) { GGML_ASSERT(plan_out != nullptr); GGML_ASSERT(gf != nullptr); + + // Start from the user-set budget. When streaming is on, clamp to actual + // free VRAM at this moment so we don't overcommit when another runner + // (e.g. the LLM in cond_stage) already grabbed a chunk-K resident set + // earlier in the generation. + size_t effective_budget = max_graph_vram_bytes; + if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) { + ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend); + if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + size_t free_vram = 0, total_vram = 0; + ggml_backend_dev_memory(dev, &free_vram, &total_vram); + constexpr size_t safety_margin = 512ull * 1024 * 1024; + size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0; + if (free_clamp < effective_budget) { + LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB", + get_desc().c_str(), + free_clamp / (1024.0 * 1024.0), + effective_budget / (1024.0 * 1024.0)); + effective_budget = free_clamp; + } + } + } + + // Split the effective budget so chunk-K residency and the prefetch + // window (2 segments + compute buffer + safety) coexist: + // effective = chunk_K + 2*segment + reserve, with chunk_K == 2*segment + // => segment = (effective - reserve) / 4 + // => chunk_K = 2 * segment + constexpr size_t reserve = 512ull * 1024 * 1024 + 768ull * 1024 * 1024; + size_t plan_budget = effective_budget; + size_t residency_budget = effective_budget; + if (stream_layers_enabled && effective_budget > 0) { + // Always pass the full effective_budget to the planner. Earlier + // versions of this branch shrank the budget to (effective-reserve)/4 + // so the prefetch path could hold two segments + compute buffer + // simultaneously and overlap H2D with compute. That worked for + // simple workloads but produced wrong output (white frames / + // colored noise) whenever the smaller-merged-segments combined + // with non-default --guidance or --flow-shift and batch_count > 1 + // (FP error accumulates across the extra boundary-cache + // roundtrips). Runtime LoRA needed the same fall-back for the + // same reason. With the full budget the planner produces the + // walker's large merged segments; async prefetch can't engage + // (no room for two), but chunk-K residency still runs on the + // unmerged base plan and keeps most of the per-step H2D win. + plan_budget = effective_budget; + residency_budget = effective_budget; + } + *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, - max_graph_vram_bytes, + plan_budget, params_tensor_set_, get_desc().c_str()); + if (stream_layers_enabled) { + // chunk-K residency lives alongside ONE active merged segment + // at compute time (we no longer prefetch two at once). Reserve + // room for the largest merged segment's params, then let chunk-K + // fill the rest. annotate_residency internally subtracts its own + // safety + compute_buffer pad, so we pass it `effective - largest` + // and it computes available = (effective - largest) - reserve. + size_t largest_segment_params = 0; + for (const auto& seg : plan_out->segments) { + if (seg.input_param_bytes > largest_segment_params) { + largest_segment_params = seg.input_param_bytes; + } + } + const size_t residency_for_annotate = + (residency_budget > largest_segment_params) + ? (residency_budget - largest_segment_params) + : 0; + sd::ggml_graph_cut::annotate_residency(*plan_out, residency_for_annotate); + LOG_INFO("%s streaming budget = %.2f MB (largest merged segment = %.2f MB, chunk-K available = %.2f MB)", + get_desc().c_str(), + effective_budget / (1024.0 * 1024.0), + largest_segment_params / (1024.0 * 1024.0), + (residency_for_annotate > reserve ? (residency_for_annotate - reserve) : 0) / (1024.0 * 1024.0)); + } return true; } @@ -2630,6 +2991,365 @@ struct GGMLRunner { } public: + // Release chunk-K resident params held across compute() calls. Safe to + // call at any point — if no resident state exists this is a no-op. + // stable-diffusion.cpp uses this to evict the diffusion model's resident + // set before VAE decode so VAE has VRAM for its compute buffer. + void release_streaming_residency() { + restore_resident_params(); + } + + template + std::optional> compute_streaming_segments(ggml_cgraph* gf, + const GraphCutPlan& plan, + int n_threads, + bool free_compute_buffer_immediately, + bool no_return = false) { + GGML_ASSERT(gf != nullptr); + + // chunk-K residency: annotate the BASE plan (per-layer granularity) and + // offload its first-K segments' params once. Compute itself proceeds on + // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params + // filters out resident tensors so the merged-segment offload skips them. + // + // Disabled when a runtime weight_adapter (LoRA in at-runtime mode) is + // attached: chunk-K snapshots the CPU weight data once and freezes the + // GPU resident copy. A runtime adapter modifies the weights between + // calls (and even between sampling steps), which leaves the resident + // GPU copy stale — observable as garbage output on later batch images. + // Our state_token hashes tensor *pointers*, not data, so it can't + // detect the modification; the safe behaviour is to release any prior + // resident set and skip the rebuild. Async prefetch on the streaming + // path still engages, so the perf hit is bounded. + if (weight_adapter != nullptr) { + restore_resident_params(); + } else { + sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan; + if (base_plan.available) { + // Reserve headroom for the active merged-segment's params + // (the running compute holds them on GPU alongside chunk-K). + // Without this, chunk-K can grow until the largest merged + // segment's offload OOMs. Find the worst-case merged segment + // and subtract its param weight from the chunk-K budget. + size_t largest_merged_params = 0; + for (const auto& seg : plan.segments) { + if (seg.input_param_bytes > largest_merged_params) { + largest_merged_params = seg.input_param_bytes; + } + } + const size_t chunk_k_budget = + (max_graph_vram_bytes > largest_merged_params) + ? (max_graph_vram_bytes - largest_merged_params) + : 0; + sd::ggml_graph_cut::annotate_residency(base_plan, chunk_k_budget); + + std::vector resident_params; + uint64_t token = 0; + for (const auto& segment : base_plan.segments) { + if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) { + continue; + } + auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment); + for (ggml_tensor* t : seg_params) { + if (t == nullptr) continue; + resident_params.push_back(t); + token ^= reinterpret_cast(t) * 0x9E3779B97F4A7C15ull; + } + } + if (token != resident_state_token) { + if (resident_state_token != 0) { + LOG_DEBUG("%s chunk-K: base-plan residency changed, rebuilding resident set", + get_desc().c_str()); + } + restore_resident_params(); + if (!resident_params.empty()) { + if (offload_resident_params(resident_params)) { + resident_state_token = token; + } else { + LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming", + get_desc().c_str()); + restore_resident_params(); + } + } + } + } + } + + free_compute_buffer(); + free_cache_ctx_and_buffer(); + + // Load any tensors not associated with a segment (a "_global" group + // holds the head/tail params). Move to GPU once for the full run. + layer_registry_.move_layer_to_gpu("_global"); + + // Async prefetch is currently disabled. The path is still implemented + // (compute_streaming_segments_prefetch below) and was the source of + // the original 26% wallclock win on the no-LoRA path, but two + // correctness problems force it off for now: + // + // * Multi-LoRA workloads: graph_compute_async + per-segment + // pending offload races MultiLoraAdapter's per-layer patch_weight + // reads, producing colored static noise on both batch images. + // * batch_count > 1 combined with non-default --guidance AND + // --flow-shift: the smaller merged segments required to fit two + // prefetched buffers + compute buffer in --max-vram accumulate + // enough FP error across the extra boundary-cache roundtrips to + // collapse the diffusion output to pure white frames. + // + // Until both are properly fixed (most likely by porting PR #1477's + // chunk_graph.hpp — see unified_streaming_future_optimizations memory + // note), the synchronous segment loop below is the only validated + // streaming path. chunk-K residency still engages, which preserves + // most of the per-step H2D savings. + const bool prefetch_enabled = false; + + if (prefetch_enabled) { + return compute_streaming_segments_prefetch(gf, plan, n_threads, no_return); + } + + std::optional> output = sd::Tensor(); + for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { + int64_t t_segment_begin = ggml_time_ms(); + const auto& segment = plan.segments[seg_idx]; + const bool is_last = seg_idx + 1 == plan.segments.size(); + auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); + + LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)", + get_desc().c_str(), + seg_idx + 1, + plan.segments.size(), + segment.group_name.c_str(), + segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); + + // Move this segment's layer group onto GPU (skipped for resident + // groups already loaded). Registry is currently unpopulated at + // runtime, so this is a harmless no-op until chunk-K residency + // lands in a later task. + if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { + LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)", + get_desc().c_str(), + segment.group_name.c_str()); + } + + reset_segment_runtime_tensors(segment, gf); + if (!bind_segment_cached_inputs(gf, segment)) { + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + } + + if (!is_last) { + for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { + ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); + if (out_tensor != nullptr && + sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) && + future_cut_names.find(out_tensor->name) != future_cut_names.end()) { + cache(out_tensor->name, out_tensor); + } + } + } + + ggml_context* segment_graph_ctx = nullptr; + ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); + auto segment_output = execute_graph(segment_graph, + n_threads, + /*free_compute_buffer_immediately=*/true, + sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()), + /*preserve_backend_tensor_data_map=*/true, + /*no_return=*/!is_last || no_return, + &future_cut_names); + ggml_free(segment_graph_ctx); + if (!segment_output.has_value()) { + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + } + output = std::move(segment_output); + + // Evict this segment's layer group when streaming. Resident groups + // stay loaded for the next sampling step. + if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { + layer_registry_.move_layer_to_cpu(segment.group_name); + } + (void)t_segment_begin; + } + + backend_tensor_data_map.clear(); + free_cache_ctx_and_buffer(); + free_compute_ctx(); + return output; + } + +private: + // Async-prefetch streaming executor. Identical semantics to the + // synchronous compute_streaming_segments() but interleaves the next + // segment's H2D copies with the current segment's GPU compute. Requires + // the planner budget to have been shrunk (see resolve_graph_cut_plan) + // so that two segments' partial-offload buffers + the compute buffer + // fit inside max_graph_vram_bytes. + template + std::optional> compute_streaming_segments_prefetch(ggml_cgraph* gf, + const GraphCutPlan& plan, + int n_threads, + bool no_return) { + std::optional> output = sd::Tensor(); + std::vector current_params; + std::vector next_params; + + // Pre-offload first segment synchronously so we can begin compute. + current_params = sd::ggml_graph_cut::runtime_param_tensors(gf, plan.segments[0], get_desc().c_str()); + if (!offload_partial_params(current_params)) { + LOG_ERROR("%s prefetch: initial partial offload failed", get_desc().c_str()); + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + } + + auto bail = [this]() -> std::optional> { + discard_pending_params(); + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + }; + + for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { + const auto& segment = plan.segments[seg_idx]; + const bool is_last = seg_idx + 1 == plan.segments.size(); + auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); + + LOG_DEBUG("%s prefetch executing segment %zu/%zu: %s (residency=%s)", + get_desc().c_str(), + seg_idx + 1, + plan.segments.size(), + segment.group_name.c_str(), + segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); + + if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { + LOG_DEBUG("%s prefetch: no registry entry for group '%s'", + get_desc().c_str(), segment.group_name.c_str()); + } + + reset_segment_runtime_tensors(segment, gf); + if (!bind_segment_cached_inputs(gf, segment)) { + return bail(); + } + + if (!is_last) { + for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { + ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); + if (out_tensor != nullptr && + sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) && + future_cut_names.find(out_tensor->name) != future_cut_names.end()) { + cache(out_tensor->name, out_tensor); + } + } + } + + ggml_context* segment_graph_ctx = nullptr; + ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); + + // Build / alloc compute buffer for this segment. + if (!alloc_compute_buffer(segment_graph)) { + LOG_ERROR("%s prefetch: alloc_compute_buffer failed", get_desc().c_str()); + ggml_free(segment_graph_ctx); + return bail(); + } + if (!ggml_gallocr_alloc_graph(compute_allocr, segment_graph)) { + LOG_ERROR("%s prefetch: alloc_graph failed", get_desc().c_str()); + ggml_free(segment_graph_ctx); + return bail(); + } + copy_data_to_backend_tensor(segment_graph, /*clear_after_copy=*/false); + if (ggml_backend_is_cpu(runtime_backend)) { + ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); + } + + // ASYNC compute. Returns once the work is enqueued on the + // compute stream; H2D issued in offload_pending_params() below + // will land on cudaStreamPerThread (independent of compute). + ggml_status status = ggml_backend_graph_compute_async(runtime_backend, segment_graph); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s prefetch: graph_compute_async failed: %s", + get_desc().c_str(), + ggml_status_to_string(status)); + ggml_backend_synchronize(runtime_backend); + ggml_free(segment_graph_ctx); + return bail(); + } + + // Issue prefetch for the NEXT segment while compute runs. + bool prefetched_next = false; + if (!is_last) { + next_params = sd::ggml_graph_cut::runtime_param_tensors(gf, + plan.segments[seg_idx + 1], + get_desc().c_str()); + if (!offload_pending_params(next_params)) { + LOG_ERROR("%s prefetch: pending offload failed at segment %zu", + get_desc().c_str(), seg_idx + 1); + ggml_backend_synchronize(runtime_backend); + ggml_free(segment_graph_ctx); + return bail(); + } + prefetched_next = !pending_offload_pairs.empty(); + } + + // Wait for compute (and the just-issued H2D) to finish. + ggml_backend_synchronize(runtime_backend); + + // Read output for the LAST segment. + if (is_last && !no_return) { + auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); + if (result != nullptr) { + output = read_graph_tensor(result, "output"); + if (!output.has_value()) { + ggml_free(segment_graph_ctx); + return bail(); + } + } else { + output = sd::Tensor(); + } + } + + // Cache outputs that future segments will consume. + if (!copy_cache_tensors_to_cache_buffer(&future_cut_names)) { + ggml_free(segment_graph_ctx); + return bail(); + } + + ggml_free(segment_graph_ctx); + + // Free the just-used compute buffer so the next segment's + // allocator can allocate fresh. This also tears down compute_allocr. + if (compute_allocr != nullptr) { + ggml_gallocr_free(compute_allocr); + compute_allocr = nullptr; + } + + // Restore current segment's partial params (frees its GPU buffer). + restore_partial_params(); + + // Promote prefetched next-segment params to active partial state. + if (prefetched_next) { + promote_pending_to_partial(); + } + + if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { + layer_registry_.move_layer_to_cpu(segment.group_name); + } + } + + backend_tensor_data_map.clear(); + free_cache_ctx_and_buffer(); + free_compute_ctx(); + return output; + } + +public: + virtual std::string get_desc() = 0; GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend) @@ -2638,9 +3358,11 @@ struct GGMLRunner { GGML_ASSERT(runtime_backend != nullptr); GGML_ASSERT(params_backend != nullptr); alloc_params_ctx(); + layer_registry_.set_backends(runtime_backend, params_backend); } virtual ~GGMLRunner() { + restore_resident_params(); free_params_buffer(); free_compute_buffer(); free_params_ctx(); @@ -2713,6 +3435,11 @@ struct GGMLRunner { } void free_params_buffer() { + // If chunk-K residency is active, our swap pointers reference the + // params backend buffer. Tear it down BEFORE freeing the params + // buffer so the swapped-back tensors don't end up with dangling + // pointers to a freed buffer. + restore_resident_params(); if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; @@ -2736,6 +3463,7 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + discard_pending_params(); restore_partial_params(); restore_all_params(); } @@ -2817,6 +3545,13 @@ struct GGMLRunner { return std::nullopt; } if (should_use_graph_cut_segmented_compute(plan)) { + if (stream_layers_enabled) { + return compute_streaming_segments(gf, + plan, + n_threads, + free_compute_buffer_immediately, + no_return); + } return compute_with_graph_cuts(gf, plan, n_threads, @@ -2857,6 +3592,12 @@ struct GGMLRunner { max_graph_vram_bytes = max_vram_bytes; } + void set_stream_layers_enabled(bool enabled) { + stream_layers_enabled = enabled; + } + + sd::layer_registry::Registry& get_layer_registry() { return layer_registry_; } + ggml_backend_t get_runtime_backend() { return runtime_backend; } diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index a20de30c9..13570408f 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -753,4 +753,58 @@ namespace sd::ggml_graph_cut { return resolved_plan; } + void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) { + // Reset any RESIDENT marks from a prior call before considering the + // new budget. The plan is cached across compute() invocations and + // the budget changes per-call (free-VRAM clamp, runner state, etc.); + // without this reset, segments marked RESIDENT under a larger budget + // would persist when the budget shrinks. + for (auto& seg : plan.segments) { + seg.residency = SegmentResidency::STREAMED; + } + if (max_graph_vram_bytes == 0 || plan.segments.size() < 2) { + return; + } + + // Sanity: skip if no segment carries any params. + bool any_param_bearing = false; + for (const auto& seg : plan.segments) { + if (seg.input_param_bytes > 0) { + any_param_bearing = true; + break; + } + } + if (!any_param_bearing) { + return; + } + + // Headroom budget: + // - safety: minimum free VRAM (512 MB) + // - compute_buffer_reserve: peak compute buffer for one streamed segment + // Async prefetch is not yet implemented in the streaming executor, so we + // do not reserve an additional segment-sized prefetch window. Once Task 11 + // (async prefetch) lands, add `prefetch_segments * largest_streamed_segment` + // back here. + constexpr size_t safety = 512ull * 1024 * 1024; + constexpr size_t compute_buffer_reserve = 768ull * 1024 * 1024; + const size_t reserved = safety + compute_buffer_reserve; + + if (max_graph_vram_bytes <= reserved) { + return; + } + const size_t available = max_graph_vram_bytes - reserved; + + // Greedily mark segments RESIDENT from the start until we exceed the + // available budget. Honors heterogeneous segment sizes (small prelude + + // larger transformer layers) better than dividing by a single estimate. + size_t cumulative = 0; + for (auto& seg : plan.segments) { + if (cumulative + seg.input_param_bytes > available) { + break; + } + seg.residency = SegmentResidency::RESIDENT; + cumulative += seg.input_param_bytes; + } + } + } // namespace sd::ggml_graph_cut diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 9e5a1f755..32f5d908c 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -167,6 +167,7 @@ class StableDiffusionGGML { sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr}; bool offload_params_to_cpu = false; float max_vram = 0.f; + bool stream_layers = false; bool use_pmid = false; std::string backend_spec; std::string params_backend_spec; @@ -210,9 +211,14 @@ class StableDiffusionGGML { bool init_backend(const sd_ctx_params_t* sd_ctx_params) { std::string error; + // Use the member offload_params_to_cpu (not sd_ctx_params->...) so the + // auto-enable in init() (when --stream-layers is set without explicit + // params placement) actually reaches backend_manager. Otherwise the + // log says "enabling --offload-to-cpu implicitly" but params still + // get allocated on the runtime backend. if (!backend_manager.init(sd_ctx_params->backend, sd_ctx_params->params_backend, - sd_ctx_params->offload_params_to_cpu, + offload_params_to_cpu, sd_ctx_params->keep_clip_on_cpu, sd_ctx_params->keep_vae_on_cpu, sd_ctx_params->keep_control_net_on_cpu, @@ -239,8 +245,25 @@ class StableDiffusionGGML { free_params_immediately = sd_ctx_params->free_params_immediately; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; max_vram = sd_ctx_params->max_vram; + stream_layers = sd_ctx_params->stream_layers; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); + if (stream_layers && max_vram == 0.f) { + // max_vram == 0 means "disabled"; negative values mean + // "auto-detect (free VRAM minus |max_vram| GiB)" and are + // resolved later in ggml_graph_cut::resolve_auto_max_vram_bytes. + LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring"); + stream_layers = false; + } + if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) { + // Streaming needs weights somewhere they can stream FROM — i.e. + // params on CPU. Without --offload-to-cpu or an explicit + // --params-backend, weights would live on GPU, which makes + // streaming a silent no-op (params_backend == runtime_backend + // short-circuits should_use_graph_cut_segmented_compute). + LOG_INFO("--stream-layers requires CPU-resident weights; enabling --offload-to-cpu implicitly"); + offload_params_to_cpu = true; + } bool use_tae = false; bool use_audio_vae = false; @@ -419,7 +442,18 @@ class StableDiffusionGGML { } } } - if (have_quantized_weight) { + // Immediate mode bakes LoRA into weights at load time by running + // a forward pass over EVERY weight tensor. That allocates a full- + // model-sized compute buffer (~11 GB for Z-Image bf16) on the + // runtime backend. On VRAM-constrained setups (--offload-to-cpu + // and/or --stream-layers, i.e. the whole reason streaming exists), + // that allocation OOMs — so AUTO must pick runtime there instead. + // Runtime mode is correct but slower because chunk-K residency + // is skipped while a weight_adapter is attached (see + // compute_streaming_segments). + const bool streaming_constrained = sd_ctx_params->stream_layers || + sd_ctx_params->offload_params_to_cpu; + if (have_quantized_weight || streaming_constrained) { apply_lora_immediately = false; } else { apply_lora_immediately = true; @@ -705,6 +739,7 @@ class StableDiffusionGGML { get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); + diffusion_model->set_stream_layers_enabled(stream_layers); get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); if (sd_version_is_unet_edit(version)) { @@ -713,6 +748,7 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); + high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); } @@ -2277,6 +2313,21 @@ class StableDiffusionGGML { } sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) { + // Release diffusion's chunk-K resident set before VAE decode. With + // --stream-layers, the diffusion runner keeps several GB resident + // across sampling steps to amortize H2D. VAE decode happens once at + // the end, so holding diffusion's resident set during decode just + // starves VAE of VRAM for its compute buffer (which can be several + // GB at full image resolution). Eviction frees that VRAM for VAE + // and is paid back at the next generation's first sampling step. + if (stream_layers) { + if (diffusion_model) { + diffusion_model->release_streaming_residency(); + } + if (high_noise_diffusion_model) { + high_noise_diffusion_model->release_streaming_residency(); + } + } auto latents = first_stage_model->diffusion_to_vae_latents(x); first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); @@ -2592,6 +2643,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->max_vram = 0.f; + sd_ctx_params->stream_layers = false; sd_ctx_params->enable_mmap = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2638,6 +2690,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "prediction: %s\n" "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" + "stream_layers: %s\n" "backend: %s\n" "params_backend: %s\n" "keep_clip_on_cpu: %s\n" @@ -2675,6 +2728,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, + BOOL_STR(sd_ctx_params->stream_layers), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), From 5cd49ca72793b9e37567af5db1ed905a5b7ee455 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Fri, 29 May 2026 14:34:01 +0200 Subject: [PATCH 03/11] Snapshot+restore persistent EXTERNAL inputs so runtime LoRA scales correctly --- src/ggml_extend.hpp | 63 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 093220a85..de3e2f587 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2704,8 +2704,38 @@ struct GGMLRunner { return true; } + struct PersistentExternalBinding { + ggml_backend_buffer_t buffer = nullptr; + void* data = nullptr; + void* extra = nullptr; + }; + + void snapshot_persistent_externals(const sd::ggml_graph_cut::Plan& plan, + ggml_cgraph* gf, + std::unordered_map& out) { + GGML_ASSERT(gf != nullptr); + out.clear(); + for (const auto& segment : plan.segments) { + for (const auto& input : segment.input_refs) { + if (input.type != GraphCutSegment::INPUT_EXTERNAL) { + continue; + } + ggml_tensor* tensor = sd::ggml_graph_cut::input_tensor(gf, input); + if (tensor == nullptr || tensor->buffer == nullptr) { + continue; + } + PersistentExternalBinding binding; + binding.buffer = tensor->buffer; + binding.data = tensor->data; + binding.extra = tensor->extra; + out[tensor] = binding; + } + } + } + void reset_segment_runtime_tensors(const GraphCutSegment& segment, - ggml_cgraph* gf) { + ggml_cgraph* gf, + const std::unordered_map* persistent_externals = nullptr) { GGML_ASSERT(gf != nullptr); for (const auto& input : segment.input_refs) { @@ -2715,11 +2745,25 @@ struct GGMLRunner { } switch (input.type) { case GraphCutSegment::INPUT_PREVIOUS_CUT: - case GraphCutSegment::INPUT_EXTERNAL: input_tensor->buffer = nullptr; input_tensor->data = nullptr; input_tensor->extra = nullptr; break; + case GraphCutSegment::INPUT_EXTERNAL: { + if (persistent_externals != nullptr) { + auto it = persistent_externals->find(input_tensor); + if (it != persistent_externals->end()) { + input_tensor->buffer = it->second.buffer; + input_tensor->data = it->second.data; + input_tensor->extra = it->second.extra; + break; + } + } + input_tensor->buffer = nullptr; + input_tensor->data = nullptr; + input_tensor->extra = nullptr; + break; + } case GraphCutSegment::INPUT_PARAM: break; } @@ -2934,6 +2978,9 @@ struct GGMLRunner { free_compute_buffer(); free_cache_ctx_and_buffer(); + std::unordered_map persistent_externals; + snapshot_persistent_externals(plan, gf, persistent_externals); + std::optional> output = sd::Tensor(); for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { int64_t t_segment_begin = ggml_time_ms(); @@ -2945,7 +2992,7 @@ struct GGMLRunner { plan.segments.size(), segment.group_name.c_str()); - reset_segment_runtime_tensors(segment, gf); + reset_segment_runtime_tensors(segment, gf, &persistent_externals); if (!bind_segment_cached_inputs(gf, segment)) { free_cache_ctx_and_buffer(); free_compute_buffer(); @@ -3107,6 +3154,9 @@ struct GGMLRunner { return compute_streaming_segments_prefetch(gf, plan, n_threads, no_return); } + std::unordered_map persistent_externals; + snapshot_persistent_externals(plan, gf, persistent_externals); + std::optional> output = sd::Tensor(); for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { int64_t t_segment_begin = ggml_time_ms(); @@ -3131,7 +3181,7 @@ struct GGMLRunner { segment.group_name.c_str()); } - reset_segment_runtime_tensors(segment, gf); + reset_segment_runtime_tensors(segment, gf, &persistent_externals); if (!bind_segment_cached_inputs(gf, segment)) { free_cache_ctx_and_buffer(); free_compute_buffer(); @@ -3216,6 +3266,9 @@ struct GGMLRunner { return std::nullopt; }; + std::unordered_map persistent_externals; + snapshot_persistent_externals(plan, gf, persistent_externals); + for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { const auto& segment = plan.segments[seg_idx]; const bool is_last = seg_idx + 1 == plan.segments.size(); @@ -3233,7 +3286,7 @@ struct GGMLRunner { get_desc().c_str(), segment.group_name.c_str()); } - reset_segment_runtime_tensors(segment, gf); + reset_segment_runtime_tensors(segment, gf, &persistent_externals); if (!bind_segment_cached_inputs(gf, segment)) { return bail(); } From 94be2a79f8808a8c43562e5fc98f163006f949a2 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sat, 30 May 2026 22:47:48 +0200 Subject: [PATCH 04/11] Warn-and-disable --stream-layers when --offload-to-cpu missing --- src/stable-diffusion.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 501b22f22..4546094ad 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -256,13 +256,11 @@ class StableDiffusionGGML { stream_layers = false; } if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) { - // Streaming needs weights somewhere they can stream FROM — i.e. - // params on CPU. Without --offload-to-cpu or an explicit - // --params-backend, weights would live on GPU, which makes - // streaming a silent no-op (params_backend == runtime_backend - // short-circuits should_use_graph_cut_segmented_compute). - LOG_INFO("--stream-layers requires CPU-resident weights; enabling --offload-to-cpu implicitly"); - offload_params_to_cpu = true; + // Streaming needs weights on CPU to stream from. Mirror the + // --max-vram-missing branch above: warn and disable instead of + // silently flipping the caller's offload-to-cpu setting. + LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring"); + stream_layers = false; } bool use_tae = false; From fc9ddeeacde30bca5f342c634d65acdfe26e1be2 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sat, 30 May 2026 22:47:49 +0200 Subject: [PATCH 05/11] Rename Registry to LayerRegistry for clarity at call sites --- src/ggml_extend.hpp | 4 ++-- src/layer_registry.cpp | 10 +++++----- src/layer_registry.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index de3e2f587..3b2f59583 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1750,7 +1750,7 @@ struct GGMLRunner { size_t max_graph_vram_bytes = 0; bool stream_layers_enabled = false; - sd::layer_registry::Registry layer_registry_; + sd::layer_registry::LayerRegistry layer_registry_; std::shared_ptr weight_adapter = nullptr; @@ -3649,7 +3649,7 @@ struct GGMLRunner { stream_layers_enabled = enabled; } - sd::layer_registry::Registry& get_layer_registry() { return layer_registry_; } + sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } ggml_backend_t get_runtime_backend() { return runtime_backend; diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp index 9b809d7b2..78ece2b91 100644 --- a/src/layer_registry.cpp +++ b/src/layer_registry.cpp @@ -6,13 +6,13 @@ namespace sd::layer_registry { -void Registry::register_layer(const std::string& name, ggml_tensor* tensor) { +void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) { auto& info = layers_[name]; info.tensors.push_back(tensor); info.bytes += ggml_nbytes(tensor); } -bool Registry::move_layer_to_gpu(const std::string& name) { +bool LayerRegistry::move_layer_to_gpu(const std::string& name) { auto it = layers_.find(name); if (it == layers_.end()) return false; @@ -80,7 +80,7 @@ bool Registry::move_layer_to_gpu(const std::string& name) { return true; } -bool Registry::move_layer_to_cpu(const std::string& name) { +bool LayerRegistry::move_layer_to_cpu(const std::string& name) { auto it = layers_.find(name); if (it == layers_.end()) return false; @@ -114,12 +114,12 @@ bool Registry::move_layer_to_cpu(const std::string& name) { return true; } -bool Registry::is_layer_on_gpu(const std::string& name) const { +bool LayerRegistry::is_layer_on_gpu(const std::string& name) const { auto it = layers_.find(name); return it != layers_.end() && it->second.on_gpu; } -size_t Registry::get_layer_size(const std::string& name) const { +size_t LayerRegistry::get_layer_size(const std::string& name) const { auto it = layers_.find(name); return it != layers_.end() ? it->second.bytes : 0; } diff --git a/src/layer_registry.h b/src/layer_registry.h index d8b572db3..6727130e4 100644 --- a/src/layer_registry.h +++ b/src/layer_registry.h @@ -20,10 +20,10 @@ struct LayerInfo { size_t bytes = 0; }; -class Registry { +class LayerRegistry { public: - Registry() = default; - Registry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) + LayerRegistry() = default; + LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { From 06b19946ac9423fa846c7f85681801df7c11dbf4 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sun, 31 May 2026 20:45:01 +0200 Subject: [PATCH 06/11] Model active segment footprint in annotate_residency --- src/ggml_graph_cut.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index 13570408f..77dba0b93 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -778,16 +778,23 @@ namespace sd::ggml_graph_cut { return; } - // Headroom budget: - // - safety: minimum free VRAM (512 MB) - // - compute_buffer_reserve: peak compute buffer for one streamed segment - // Async prefetch is not yet implemented in the streaming executor, so we - // do not reserve an additional segment-sized prefetch window. Once Task 11 - // (async prefetch) lands, add `prefetch_segments * largest_streamed_segment` - // back here. - constexpr size_t safety = 512ull * 1024 * 1024; - constexpr size_t compute_buffer_reserve = 768ull * 1024 * 1024; - const size_t reserved = safety + compute_buffer_reserve; + // Model the active streamed segment's real footprint instead of a + // fixed reserve: params + compute buffer + output + boundary-cut + // inputs. The largest such footprint over the plan is the headroom + // the resident set must leave free. + size_t worst_streamed_footprint = 0; + for (const auto& seg : plan.segments) { + const size_t seg_footprint = seg.input_param_bytes + + seg.compute_buffer_size + + seg.output_bytes + + seg.input_previous_cut_bytes + + seg.input_external_bytes; + if (seg_footprint > worst_streamed_footprint) { + worst_streamed_footprint = seg_footprint; + } + } + constexpr size_t safety = 512ull * 1024 * 1024; + const size_t reserved = safety + worst_streamed_footprint; if (max_graph_vram_bytes <= reserved) { return; From 8c337ab472533d89113ddbbbc2e75fc07726e509 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sun, 31 May 2026 20:45:02 +0200 Subject: [PATCH 07/11] Drop async prefetch path and consolidate residency annotation --- src/ggml_extend.hpp | 463 ++------------------------------------------ 1 file changed, 18 insertions(+), 445 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 3b2f59583..d85f3480c 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1727,20 +1727,10 @@ struct GGMLRunner { ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; std::vector> partial_offload_pairs; - // Async-prefetch staging: a parallel partial-offload track used by the - // streaming executor to issue the NEXT segment's H2D copies while the - // CURRENT segment is computing. After the current segment's compute is - // done and its partial state is torn down, the pending_* members are - // promoted into partial_* via promote_pending_to_partial(). - ggml_context* pending_offload_ctx = nullptr; - ggml_backend_buffer_t pending_runtime_params_buffer = nullptr; - std::vector> pending_offload_pairs; - - // chunk-K residency track: holds params that stay GPU-resident across - // multiple compute_streaming_segments() invocations within a generation. - // Coexists with the partial_* per-segment track; offload_partial_params() - // filters out anything already in resident_param_set so its - // dup-copy-swap dance doesn't tear down the resident buffer-swap. + // chunk-K residency: GPU-resident params shared across streaming + // segments. offload_partial_params() skips tensors in + // resident_param_set so the per-segment offload track does not tear + // down the resident buffer-swap. ggml_context* resident_offload_ctx = nullptr; std::vector> resident_offload_pairs; ggml_backend_buffer_t resident_runtime_params_buffer = nullptr; @@ -1828,10 +1818,6 @@ struct GGMLRunner { ggml_free(partial_offload_ctx); partial_offload_ctx = nullptr; } - if (pending_offload_ctx != nullptr) { - ggml_free(pending_offload_ctx); - pending_offload_ctx = nullptr; - } } void alloc_cache_ctx() { @@ -2347,157 +2333,9 @@ struct GGMLRunner { } } - // Issue an H2D copy for the next segment's params WITHOUT performing - // a buffer-swap on the source tensors yet. The dup-tensors hold the GPU - // data ready to swap in. Unlike offload_partial_params, this does NOT - // call restore_partial_params() — the current partial state must remain - // intact so the in-flight compute is unaffected. - // - // The H2D goes through ggml_backend_tensor_copy() which for CUDA hits - // cudaStreamPerThread; compute runs on cuda_ctx->stream() (a separate - // non-blocking stream), so the two should overlap on-device. - bool offload_pending_params(const std::vector& tensors) { - if (params_backend == runtime_backend) { - return true; - } - if (tensors.empty()) { - return true; - } - GGML_ASSERT(pending_offload_ctx == nullptr); - GGML_ASSERT(pending_runtime_params_buffer == nullptr); - GGML_ASSERT(pending_offload_pairs.empty()); - - std::vector unique_tensors; - std::unordered_set seen_tensors; - unique_tensors.reserve(tensors.size()); - seen_tensors.reserve(tensors.size()); - for (ggml_tensor* tensor : tensors) { - if (tensor == nullptr) { - continue; - } - if (resident_param_set.find(tensor) != resident_param_set.end()) { - continue; - } - if (seen_tensors.insert(tensor).second) { - unique_tensors.push_back(tensor); - } - } - if (unique_tensors.empty()) { - return true; - } - - ggml_init_params params; - params.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); - params.mem_buffer = nullptr; - params.no_alloc = true; - - pending_offload_ctx = ggml_init(params); - GGML_ASSERT(pending_offload_ctx != nullptr); - - pending_offload_pairs.clear(); - pending_offload_pairs.reserve(unique_tensors.size()); - - for (ggml_tensor* tensor : unique_tensors) { - GGML_ASSERT(tensor->view_src == nullptr); - ggml_tensor* dup = ggml_dup_tensor(pending_offload_ctx, tensor); - ggml_set_name(dup, tensor->name); - pending_offload_pairs.push_back({tensor, dup}); - } - - pending_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(pending_offload_ctx, runtime_backend); - if (pending_runtime_params_buffer == nullptr) { - LOG_ERROR("%s alloc pending runtime params backend buffer failed, num_tensors = %zu", - get_desc().c_str(), - pending_offload_pairs.size()); - ggml_free(pending_offload_ctx); - pending_offload_ctx = nullptr; - pending_offload_pairs.clear(); - return false; - } - ggml_backend_buffer_set_usage(pending_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - - // Issue the H2D copies. The src tensors are still pointing at host - // (or params backend) data; the dup tensors are on runtime_backend. - // ggml_backend_tensor_copy(host_src, gpu_dst) calls - // ggml_backend_tensor_set(dst, ...) which for CUDA enqueues on - // cudaStreamPerThread — independent of the compute stream. - for (auto& pair : pending_offload_pairs) { - ggml_tensor* tensor = pair.first; - ggml_tensor* offload_tensor = pair.second; - ggml_backend_tensor_copy(tensor, offload_tensor); - } - - return true; - } - - // Promote a pending (prefetched) offload track into the active partial - // track. Caller MUST have already torn down the previous partial track - // via restore_partial_params(). After this call, partial_* owns the - // pending dup-tensors and the swap into the active source tensors is - // performed here. - void promote_pending_to_partial() { - if (pending_offload_pairs.empty()) { - if (pending_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(pending_runtime_params_buffer); - pending_runtime_params_buffer = nullptr; - } - if (pending_offload_ctx != nullptr) { - ggml_free(pending_offload_ctx); - pending_offload_ctx = nullptr; - } - return; - } - GGML_ASSERT(partial_offload_pairs.empty()); - GGML_ASSERT(partial_offload_ctx == nullptr); - GGML_ASSERT(partial_runtime_params_buffer == nullptr); - - // Swap source tensors to point at the prefetched GPU storage. - for (auto& pair : pending_offload_pairs) { - ggml_tensor* tensor = pair.first; - ggml_tensor* offload_tensor = pair.second; - std::swap(tensor->buffer, offload_tensor->buffer); - std::swap(tensor->data, offload_tensor->data); - std::swap(tensor->extra, offload_tensor->extra); - } - partial_offload_ctx = pending_offload_ctx; - partial_runtime_params_buffer = pending_runtime_params_buffer; - partial_offload_pairs = std::move(pending_offload_pairs); - pending_offload_ctx = nullptr; - pending_runtime_params_buffer = nullptr; - pending_offload_pairs.clear(); - } - - // Discard any in-flight pending offload track without promoting it. - // Used on error paths after offload_pending_params() succeeded but the - // subsequent compute failed. - void discard_pending_params() { - if (pending_offload_pairs.empty()) { - if (pending_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(pending_runtime_params_buffer); - pending_runtime_params_buffer = nullptr; - } - if (pending_offload_ctx != nullptr) { - ggml_free(pending_offload_ctx); - pending_offload_ctx = nullptr; - } - return; - } - if (pending_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(pending_runtime_params_buffer); - pending_runtime_params_buffer = nullptr; - } - pending_offload_pairs.clear(); - if (pending_offload_ctx != nullptr) { - ggml_free(pending_offload_ctx); - pending_offload_ctx = nullptr; - } - } - - // chunk-K: load a set of params onto the runtime backend and keep them - // resident across compute() calls. Caller MUST have torn down any prior - // resident state via restore_resident_params() before calling. By design - // this does NOT call restore_resident_params() itself — the resident - // track persists across compute() invocations. + // chunk-K: load params onto the runtime backend and keep them resident + // across compute() calls. Caller must release any prior resident set + // via restore_resident_params() first. bool offload_resident_params(const std::vector& tensors) { if (params_backend == runtime_backend) { return true; @@ -2645,61 +2483,16 @@ struct GGMLRunner { } } - // Split the effective budget so chunk-K residency and the prefetch - // window (2 segments + compute buffer + safety) coexist: - // effective = chunk_K + 2*segment + reserve, with chunk_K == 2*segment - // => segment = (effective - reserve) / 4 - // => chunk_K = 2 * segment - constexpr size_t reserve = 512ull * 1024 * 1024 + 768ull * 1024 * 1024; - size_t plan_budget = effective_budget; - size_t residency_budget = effective_budget; - if (stream_layers_enabled && effective_budget > 0) { - // Always pass the full effective_budget to the planner. Earlier - // versions of this branch shrank the budget to (effective-reserve)/4 - // so the prefetch path could hold two segments + compute buffer - // simultaneously and overlap H2D with compute. That worked for - // simple workloads but produced wrong output (white frames / - // colored noise) whenever the smaller-merged-segments combined - // with non-default --guidance or --flow-shift and batch_count > 1 - // (FP error accumulates across the extra boundary-cache - // roundtrips). Runtime LoRA needed the same fall-back for the - // same reason. With the full budget the planner produces the - // walker's large merged segments; async prefetch can't engage - // (no room for two), but chunk-K residency still runs on the - // unmerged base plan and keeps most of the per-step H2D win. - plan_budget = effective_budget; - residency_budget = effective_budget; - } - *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, - plan_budget, + effective_budget, params_tensor_set_, get_desc().c_str()); if (stream_layers_enabled) { - // chunk-K residency lives alongside ONE active merged segment - // at compute time (we no longer prefetch two at once). Reserve - // room for the largest merged segment's params, then let chunk-K - // fill the rest. annotate_residency internally subtracts its own - // safety + compute_buffer pad, so we pass it `effective - largest` - // and it computes available = (effective - largest) - reserve. - size_t largest_segment_params = 0; - for (const auto& seg : plan_out->segments) { - if (seg.input_param_bytes > largest_segment_params) { - largest_segment_params = seg.input_param_bytes; - } - } - const size_t residency_for_annotate = - (residency_budget > largest_segment_params) - ? (residency_budget - largest_segment_params) - : 0; - sd::ggml_graph_cut::annotate_residency(*plan_out, residency_for_annotate); - LOG_INFO("%s streaming budget = %.2f MB (largest merged segment = %.2f MB, chunk-K available = %.2f MB)", + LOG_INFO("%s streaming budget = %.2f MB", get_desc().c_str(), - effective_budget / (1024.0 * 1024.0), - largest_segment_params / (1024.0 * 1024.0), - (residency_for_annotate > reserve ? (residency_for_annotate - reserve) : 0) / (1024.0 * 1024.0)); + effective_budget / (1024.0 * 1024.0)); } return true; } @@ -3057,38 +2850,17 @@ struct GGMLRunner { // chunk-K residency: annotate the BASE plan (per-layer granularity) and // offload its first-K segments' params once. Compute itself proceeds on // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params - // filters out resident tensors so the merged-segment offload skips them. - // - // Disabled when a runtime weight_adapter (LoRA in at-runtime mode) is - // attached: chunk-K snapshots the CPU weight data once and freezes the - // GPU resident copy. A runtime adapter modifies the weights between - // calls (and even between sampling steps), which leaves the resident - // GPU copy stale — observable as garbage output on later batch images. - // Our state_token hashes tensor *pointers*, not data, so it can't - // detect the modification; the safe behaviour is to release any prior - // resident set and skip the rebuild. Async prefetch on the streaming - // path still engages, so the perf hit is bounded. + // Runtime weight_adapter (e.g. LoRA in at-runtime mode) mutates the + // CPU weight data between compute calls. The chunk-K resident GPU + // copy is snapshotted once at offload, so it would go stale; the + // state_token only hashes tensor pointers and cannot detect this. + // Release any resident set and skip rebuild in that case. if (weight_adapter != nullptr) { restore_resident_params(); } else { sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan; if (base_plan.available) { - // Reserve headroom for the active merged-segment's params - // (the running compute holds them on GPU alongside chunk-K). - // Without this, chunk-K can grow until the largest merged - // segment's offload OOMs. Find the worst-case merged segment - // and subtract its param weight from the chunk-K budget. - size_t largest_merged_params = 0; - for (const auto& seg : plan.segments) { - if (seg.input_param_bytes > largest_merged_params) { - largest_merged_params = seg.input_param_bytes; - } - } - const size_t chunk_k_budget = - (max_graph_vram_bytes > largest_merged_params) - ? (max_graph_vram_bytes - largest_merged_params) - : 0; - sd::ggml_graph_cut::annotate_residency(base_plan, chunk_k_budget); + sd::ggml_graph_cut::annotate_residency(base_plan, max_graph_vram_bytes); std::vector resident_params; uint64_t token = 0; @@ -3104,10 +2876,6 @@ struct GGMLRunner { } } if (token != resident_state_token) { - if (resident_state_token != 0) { - LOG_DEBUG("%s chunk-K: base-plan residency changed, rebuilding resident set", - get_desc().c_str()); - } restore_resident_params(); if (!resident_params.empty()) { if (offload_resident_params(resident_params)) { @@ -3125,35 +2893,10 @@ struct GGMLRunner { free_compute_buffer(); free_cache_ctx_and_buffer(); - // Load any tensors not associated with a segment (a "_global" group - // holds the head/tail params). Move to GPU once for the full run. + // Tensors not associated with any segment (the "_global" group holds + // head/tail params) move to GPU once for the full run. layer_registry_.move_layer_to_gpu("_global"); - // Async prefetch is currently disabled. The path is still implemented - // (compute_streaming_segments_prefetch below) and was the source of - // the original 26% wallclock win on the no-LoRA path, but two - // correctness problems force it off for now: - // - // * Multi-LoRA workloads: graph_compute_async + per-segment - // pending offload races MultiLoraAdapter's per-layer patch_weight - // reads, producing colored static noise on both batch images. - // * batch_count > 1 combined with non-default --guidance AND - // --flow-shift: the smaller merged segments required to fit two - // prefetched buffers + compute buffer in --max-vram accumulate - // enough FP error across the extra boundary-cache roundtrips to - // collapse the diffusion output to pure white frames. - // - // Until both are properly fixed (most likely by porting PR #1477's - // chunk_graph.hpp — see unified_streaming_future_optimizations memory - // note), the synchronous segment loop below is the only validated - // streaming path. chunk-K residency still engages, which preserves - // most of the per-step H2D savings. - const bool prefetch_enabled = false; - - if (prefetch_enabled) { - return compute_streaming_segments_prefetch(gf, plan, n_threads, no_return); - } - std::unordered_map persistent_externals; snapshot_persistent_externals(plan, gf, persistent_externals); @@ -3232,175 +2975,6 @@ struct GGMLRunner { return output; } -private: - // Async-prefetch streaming executor. Identical semantics to the - // synchronous compute_streaming_segments() but interleaves the next - // segment's H2D copies with the current segment's GPU compute. Requires - // the planner budget to have been shrunk (see resolve_graph_cut_plan) - // so that two segments' partial-offload buffers + the compute buffer - // fit inside max_graph_vram_bytes. - template - std::optional> compute_streaming_segments_prefetch(ggml_cgraph* gf, - const GraphCutPlan& plan, - int n_threads, - bool no_return) { - std::optional> output = sd::Tensor(); - std::vector current_params; - std::vector next_params; - - // Pre-offload first segment synchronously so we can begin compute. - current_params = sd::ggml_graph_cut::runtime_param_tensors(gf, plan.segments[0], get_desc().c_str()); - if (!offload_partial_params(current_params)) { - LOG_ERROR("%s prefetch: initial partial offload failed", get_desc().c_str()); - free_cache_ctx_and_buffer(); - free_compute_buffer(); - free_compute_ctx(); - return std::nullopt; - } - - auto bail = [this]() -> std::optional> { - discard_pending_params(); - free_cache_ctx_and_buffer(); - free_compute_buffer(); - free_compute_ctx(); - return std::nullopt; - }; - - std::unordered_map persistent_externals; - snapshot_persistent_externals(plan, gf, persistent_externals); - - for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { - const auto& segment = plan.segments[seg_idx]; - const bool is_last = seg_idx + 1 == plan.segments.size(); - auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); - - LOG_DEBUG("%s prefetch executing segment %zu/%zu: %s (residency=%s)", - get_desc().c_str(), - seg_idx + 1, - plan.segments.size(), - segment.group_name.c_str(), - segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); - - if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { - LOG_DEBUG("%s prefetch: no registry entry for group '%s'", - get_desc().c_str(), segment.group_name.c_str()); - } - - reset_segment_runtime_tensors(segment, gf, &persistent_externals); - if (!bind_segment_cached_inputs(gf, segment)) { - return bail(); - } - - if (!is_last) { - for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { - ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); - if (out_tensor != nullptr && - sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) && - future_cut_names.find(out_tensor->name) != future_cut_names.end()) { - cache(out_tensor->name, out_tensor); - } - } - } - - ggml_context* segment_graph_ctx = nullptr; - ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); - - // Build / alloc compute buffer for this segment. - if (!alloc_compute_buffer(segment_graph)) { - LOG_ERROR("%s prefetch: alloc_compute_buffer failed", get_desc().c_str()); - ggml_free(segment_graph_ctx); - return bail(); - } - if (!ggml_gallocr_alloc_graph(compute_allocr, segment_graph)) { - LOG_ERROR("%s prefetch: alloc_graph failed", get_desc().c_str()); - ggml_free(segment_graph_ctx); - return bail(); - } - copy_data_to_backend_tensor(segment_graph, /*clear_after_copy=*/false); - if (ggml_backend_is_cpu(runtime_backend)) { - ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); - } - - // ASYNC compute. Returns once the work is enqueued on the - // compute stream; H2D issued in offload_pending_params() below - // will land on cudaStreamPerThread (independent of compute). - ggml_status status = ggml_backend_graph_compute_async(runtime_backend, segment_graph); - if (status != GGML_STATUS_SUCCESS) { - LOG_ERROR("%s prefetch: graph_compute_async failed: %s", - get_desc().c_str(), - ggml_status_to_string(status)); - ggml_backend_synchronize(runtime_backend); - ggml_free(segment_graph_ctx); - return bail(); - } - - // Issue prefetch for the NEXT segment while compute runs. - bool prefetched_next = false; - if (!is_last) { - next_params = sd::ggml_graph_cut::runtime_param_tensors(gf, - plan.segments[seg_idx + 1], - get_desc().c_str()); - if (!offload_pending_params(next_params)) { - LOG_ERROR("%s prefetch: pending offload failed at segment %zu", - get_desc().c_str(), seg_idx + 1); - ggml_backend_synchronize(runtime_backend); - ggml_free(segment_graph_ctx); - return bail(); - } - prefetched_next = !pending_offload_pairs.empty(); - } - - // Wait for compute (and the just-issued H2D) to finish. - ggml_backend_synchronize(runtime_backend); - - // Read output for the LAST segment. - if (is_last && !no_return) { - auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); - if (result != nullptr) { - output = read_graph_tensor(result, "output"); - if (!output.has_value()) { - ggml_free(segment_graph_ctx); - return bail(); - } - } else { - output = sd::Tensor(); - } - } - - // Cache outputs that future segments will consume. - if (!copy_cache_tensors_to_cache_buffer(&future_cut_names)) { - ggml_free(segment_graph_ctx); - return bail(); - } - - ggml_free(segment_graph_ctx); - - // Free the just-used compute buffer so the next segment's - // allocator can allocate fresh. This also tears down compute_allocr. - if (compute_allocr != nullptr) { - ggml_gallocr_free(compute_allocr); - compute_allocr = nullptr; - } - - // Restore current segment's partial params (frees its GPU buffer). - restore_partial_params(); - - // Promote prefetched next-segment params to active partial state. - if (prefetched_next) { - promote_pending_to_partial(); - } - - if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { - layer_registry_.move_layer_to_cpu(segment.group_name); - } - } - - backend_tensor_data_map.clear(); - free_cache_ctx_and_buffer(); - free_compute_ctx(); - return output; - } - public: virtual std::string get_desc() = 0; @@ -3516,7 +3090,6 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } - discard_pending_params(); restore_partial_params(); restore_all_params(); } From 60d00cc35c479af52503212067696a8841f283de Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 2 Jun 2026 21:50:35 +0800 Subject: [PATCH 08/11] use clamped streaming budget for residency --- src/ggml_extend.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index c7f6c540e..987fe3ab3 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2429,7 +2429,8 @@ struct GGMLRunner { } bool resolve_graph_cut_plan(ggml_cgraph* gf, - GraphCutPlan* plan_out) { + GraphCutPlan* plan_out, + size_t* effective_budget_out = nullptr) { GGML_ASSERT(plan_out != nullptr); GGML_ASSERT(gf != nullptr); @@ -2455,6 +2456,10 @@ struct GGMLRunner { } } + if (effective_budget_out != nullptr) { + *effective_budget_out = effective_budget; + } + *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, @@ -2814,6 +2819,7 @@ struct GGMLRunner { template std::optional> compute_streaming_segments(ggml_cgraph* gf, const GraphCutPlan& plan, + size_t residency_budget_bytes, int n_threads, bool free_compute_buffer_immediately, bool no_return = false) { @@ -2832,7 +2838,7 @@ struct GGMLRunner { } else { sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan; if (base_plan.available) { - sd::ggml_graph_cut::annotate_residency(base_plan, max_graph_vram_bytes); + sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes); std::vector resident_params; uint64_t token = 0; @@ -3138,7 +3144,8 @@ struct GGMLRunner { if (can_attempt_graph_cut_segmented_compute()) { GraphCutPlan plan; - if (!resolve_graph_cut_plan(gf, &plan)) { + size_t effective_graph_vram_bytes = 0; + if (!resolve_graph_cut_plan(gf, &plan, &effective_graph_vram_bytes)) { free_compute_ctx(); return std::nullopt; } @@ -3146,6 +3153,7 @@ struct GGMLRunner { if (stream_layers_enabled) { return compute_streaming_segments(gf, plan, + effective_graph_vram_bytes, n_threads, free_compute_buffer_immediately, no_return); From d097f86ef2b416a9421cc4812ffa0a6cc7f3d547 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 2 Jun 2026 21:52:47 +0800 Subject: [PATCH 09/11] use validated streaming flag for lora auto mode --- src/stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index b79bbcdc9..1bfba8a32 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -471,7 +471,7 @@ class StableDiffusionGGML { // Runtime mode is correct but slower because chunk-K residency // is skipped while a weight_adapter is attached (see // compute_streaming_segments). - const bool streaming_constrained = sd_ctx_params->stream_layers || + const bool streaming_constrained = stream_layers || sd_ctx_params->offload_params_to_cpu; if (have_quantized_weight || streaming_constrained) { apply_lora_immediately = false; From 30a4c8e1134a96f781fae70b1ec47acbd0436663 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 2 Jun 2026 22:02:21 +0800 Subject: [PATCH 10/11] trim streaming implementation comments --- src/ggml_extend.hpp | 42 ++++++---------------------------------- src/ggml_graph_cut.cpp | 15 ++------------ src/ggml_graph_cut.h | 16 ++------------- src/stable-diffusion.cpp | 25 +++--------------------- 4 files changed, 13 insertions(+), 85 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 987fe3ab3..08a08036f 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1699,10 +1699,7 @@ struct GGMLRunner { ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; std::vector> partial_offload_pairs; - // chunk-K residency: GPU-resident params shared across streaming - // segments. offload_partial_params() skips tensors in - // resident_param_set so the per-segment offload track does not tear - // down the resident buffer-swap. + // Params kept on the runtime backend across streaming segments. ggml_context* resident_offload_ctx = nullptr; std::vector> resident_offload_pairs; ggml_backend_buffer_t resident_runtime_params_buffer = nullptr; @@ -2181,7 +2178,7 @@ struct GGMLRunner { continue; } if (resident_param_set.find(tensor) != resident_param_set.end()) { - continue; // already resident on GPU via the chunk-K track + continue; } if (seen_tensors.insert(tensor).second) { unique_tensors.push_back(tensor); @@ -2305,9 +2302,6 @@ struct GGMLRunner { } } - // chunk-K: load params onto the runtime backend and keep them resident - // across compute() calls. Caller must release any prior resident set - // via restore_resident_params() first. bool offload_resident_params(const std::vector& tensors) { if (params_backend == runtime_backend) { return true; @@ -2434,10 +2428,7 @@ struct GGMLRunner { GGML_ASSERT(plan_out != nullptr); GGML_ASSERT(gf != nullptr); - // Start from the user-set budget. When streaming is on, clamp to actual - // free VRAM at this moment so we don't overcommit when another runner - // (e.g. the LLM in cond_stage) already grabbed a chunk-K resident set - // earlier in the generation. + // Keep the plan and resident params under the same live-VRAM cap. size_t effective_budget = max_graph_vram_bytes; if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) { ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend); @@ -2808,10 +2799,6 @@ struct GGMLRunner { } public: - // Release chunk-K resident params held across compute() calls. Safe to - // call at any point — if no resident state exists this is a no-op. - // stable-diffusion.cpp uses this to evict the diffusion model's resident - // set before VAE decode so VAE has VRAM for its compute buffer. void release_streaming_residency() { restore_resident_params(); } @@ -2825,14 +2812,8 @@ struct GGMLRunner { bool no_return = false) { GGML_ASSERT(gf != nullptr); - // chunk-K residency: annotate the BASE plan (per-layer granularity) and - // offload its first-K segments' params once. Compute itself proceeds on - // the MERGED plan (`plan`) for fused-graph efficiency; offload_partial_params - // Runtime weight_adapter (e.g. LoRA in at-runtime mode) mutates the - // CPU weight data between compute calls. The chunk-K resident GPU - // copy is snapshotted once at offload, so it would go stale; the - // state_token only hashes tensor pointers and cannot detect this. - // Release any resident set and skip rebuild in that case. + // Runtime LoRA mutates CPU weights between calls, so resident GPU + // copies would go stale. if (weight_adapter != nullptr) { restore_resident_params(); } else { @@ -2871,8 +2852,6 @@ struct GGMLRunner { free_compute_buffer(); free_cache_ctx_and_buffer(); - // Tensors not associated with any segment (the "_global" group holds - // head/tail params) move to GPU once for the full run. layer_registry_.move_layer_to_gpu("_global"); std::unordered_map persistent_externals; @@ -2892,10 +2871,6 @@ struct GGMLRunner { segment.group_name.c_str(), segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); - // Move this segment's layer group onto GPU (skipped for resident - // groups already loaded). Registry is currently unpopulated at - // runtime, so this is a harmless no-op until chunk-K residency - // lands in a later task. if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)", get_desc().c_str(), @@ -2939,8 +2914,6 @@ struct GGMLRunner { } output = std::move(segment_output); - // Evict this segment's layer group when streaming. Resident groups - // stay loaded for the next sampling step. if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { layer_registry_.move_layer_to_cpu(segment.group_name); } @@ -3040,10 +3013,7 @@ struct GGMLRunner { } void free_params_buffer() { - // If chunk-K residency is active, our swap pointers reference the - // params backend buffer. Tear it down BEFORE freeing the params - // buffer so the swapped-back tensors don't end up with dangling - // pointers to a freed buffer. + // Restore swapped resident params before freeing their backing buffer. restore_resident_params(); if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index 77dba0b93..61234eaf2 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -754,11 +754,7 @@ namespace sd::ggml_graph_cut { } void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) { - // Reset any RESIDENT marks from a prior call before considering the - // new budget. The plan is cached across compute() invocations and - // the budget changes per-call (free-VRAM clamp, runner state, etc.); - // without this reset, segments marked RESIDENT under a larger budget - // would persist when the budget shrinks. + // Cached plans may be reused with a smaller live budget. for (auto& seg : plan.segments) { seg.residency = SegmentResidency::STREAMED; } @@ -766,7 +762,6 @@ namespace sd::ggml_graph_cut { return; } - // Sanity: skip if no segment carries any params. bool any_param_bearing = false; for (const auto& seg : plan.segments) { if (seg.input_param_bytes > 0) { @@ -778,10 +773,7 @@ namespace sd::ggml_graph_cut { return; } - // Model the active streamed segment's real footprint instead of a - // fixed reserve: params + compute buffer + output + boundary-cut - // inputs. The largest such footprint over the plan is the headroom - // the resident set must leave free. + // Leave room for the largest active streamed segment. size_t worst_streamed_footprint = 0; for (const auto& seg : plan.segments) { const size_t seg_footprint = seg.input_param_bytes + @@ -801,9 +793,6 @@ namespace sd::ggml_graph_cut { } const size_t available = max_graph_vram_bytes - reserved; - // Greedily mark segments RESIDENT from the start until we exceed the - // available budget. Honors heterogeneous segment sizes (small prelude + - // larger transformer layers) better than dividing by a single estimate. size_t cumulative = 0; for (auto& seg : plan.segments) { if (cumulative + seg.input_param_bytes > available) { diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h index 7943f48d6..9668bc38e 100644 --- a/src/ggml_graph_cut.h +++ b/src/ggml_graph_cut.h @@ -12,11 +12,7 @@ namespace sd::ggml_graph_cut { - // Whether a segment's params live on the GPU across the entire generation - // (RESIDENT) or get streamed in per sampling step then evicted (STREAMED). - // Only populated when the planner is invoked with stream_layers_enabled=true; - // otherwise every segment is implicitly STREAMED and the existing walker in - // GGMLRunner::compute() handles it (upstream behavior unchanged). + // Streaming residency for a segment's params. enum class SegmentResidency : uint8_t { STREAMED = 0, RESIDENT = 1, @@ -114,15 +110,7 @@ namespace sd::ggml_graph_cut { const std::unordered_set& params_tensor_set, const char* log_desc); - // Annotate the first K segments of plan as RESIDENT, where K is the - // number of leading segments that fit in the residency budget. K is - // determined by: - // K = (max_graph_vram_bytes - prefetch_reserve - safety - // - compute_buffer_reserve) / per_segment_param_bytes, - // clamped to plan.segments.size(). - // - // If max_graph_vram_bytes is 0 or the plan has fewer than 2 segments, - // this is a no-op (no annotation; behavior matches upstream). + // Mark leading segments resident when they fit after streamed-segment headroom. void annotate_residency(Plan& plan, size_t max_graph_vram_bytes); } // namespace sd::ggml_graph_cut diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 1bfba8a32..03aa29005 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -233,11 +233,6 @@ class StableDiffusionGGML { bool init_backend(const sd_ctx_params_t* sd_ctx_params) { std::string error; - // Use the member offload_params_to_cpu (not sd_ctx_params->...) so the - // auto-enable in init() (when --stream-layers is set without explicit - // params placement) actually reaches backend_manager. Otherwise the - // log says "enabling --offload-to-cpu implicitly" but params still - // get allocated on the runtime backend. if (!backend_manager.init(sd_ctx_params->backend, sd_ctx_params->params_backend, offload_params_to_cpu, @@ -271,16 +266,11 @@ class StableDiffusionGGML { backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); if (stream_layers && max_vram == 0.f) { - // max_vram == 0 means "disabled"; negative values mean - // "auto-detect (free VRAM minus |max_vram| GiB)" and are - // resolved later in ggml_graph_cut::resolve_auto_max_vram_bytes. LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring"); stream_layers = false; } if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) { - // Streaming needs weights on CPU to stream from. Mirror the - // --max-vram-missing branch above: warn and disable instead of - // silently flipping the caller's offload-to-cpu setting. + // Streaming needs CPU-resident params. LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring"); stream_layers = false; } @@ -462,15 +452,7 @@ class StableDiffusionGGML { } } } - // Immediate mode bakes LoRA into weights at load time by running - // a forward pass over EVERY weight tensor. That allocates a full- - // model-sized compute buffer (~11 GB for Z-Image bf16) on the - // runtime backend. On VRAM-constrained setups (--offload-to-cpu - // and/or --stream-layers, i.e. the whole reason streaming exists), - // that allocation OOMs — so AUTO must pick runtime there instead. - // Runtime mode is correct but slower because chunk-K residency - // is skipped while a weight_adapter is attached (see - // compute_streaming_segments). + // Avoid full-model LoRA merge buffers on constrained setups. const bool streaming_constrained = stream_layers || sd_ctx_params->offload_params_to_cpu; if (have_quantized_weight || streaming_constrained) { @@ -2398,8 +2380,7 @@ class StableDiffusionGGML { if (sd_version_is_pid(version)) { return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f); } - // chunk-K residency held across sampling steps starves VAE decode - // of VRAM for its compute buffer; release it before decode. + // Free resident diffusion params before VAE allocates its compute buffer. if (stream_layers) { if (diffusion_model) { diffusion_model->release_streaming_residency(); From 084ccb52291c6f734d1f3c66b734899eabe032f0 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 2 Jun 2026 22:24:06 +0800 Subject: [PATCH 11/11] format code --- src/ggml_extend.hpp | 59 +++++------ src/layer_registry.cpp | 211 ++++++++++++++++++++------------------- src/layer_registry.h | 67 +++++++------ src/stable-diffusion.cpp | 8 +- 4 files changed, 177 insertions(+), 168 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 08a08036f..ef104368a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1707,7 +1707,7 @@ struct GGMLRunner { uint64_t resident_state_token = 0; size_t max_graph_vram_bytes = 0; - bool stream_layers_enabled = false; + bool stream_layers_enabled = false; sd::layer_registry::LayerRegistry layer_registry_; @@ -2319,16 +2319,19 @@ struct GGMLRunner { unique_tensors.reserve(tensors.size()); seen.reserve(tensors.size()); for (ggml_tensor* t : tensors) { - if (t == nullptr) continue; - if (seen.insert(t).second) unique_tensors.push_back(t); + if (t == nullptr) + continue; + if (seen.insert(t).second) + unique_tensors.push_back(t); } - if (unique_tensors.empty()) return true; + if (unique_tensors.empty()) + return true; ggml_init_params init = {}; - init.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); - init.mem_buffer = nullptr; - init.no_alloc = true; - resident_offload_ctx = ggml_init(init); + init.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); + init.mem_buffer = nullptr; + init.no_alloc = true; + resident_offload_ctx = ggml_init(init); GGML_ASSERT(resident_offload_ctx != nullptr); resident_offload_pairs.reserve(unique_tensors.size()); @@ -2351,12 +2354,12 @@ struct GGMLRunner { ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (auto& pair : resident_offload_pairs) { - ggml_tensor* t = pair.first; + ggml_tensor* t = pair.first; ggml_tensor* twin = pair.second; ggml_backend_tensor_copy(t, twin); std::swap(t->buffer, twin->buffer); - std::swap(t->data, twin->data); - std::swap(t->extra, twin->extra); + std::swap(t->data, twin->data); + std::swap(t->extra, twin->extra); resident_param_set.insert(t); } ggml_backend_synchronize(runtime_backend); @@ -2385,14 +2388,14 @@ struct GGMLRunner { return; } for (auto& pair : resident_offload_pairs) { - ggml_tensor* t = pair.first; + ggml_tensor* t = pair.first; ggml_tensor* twin = pair.second; - t->buffer = twin->buffer; - t->data = twin->data; - t->extra = twin->extra; - twin->buffer = nullptr; - twin->data = nullptr; - twin->extra = nullptr; + t->buffer = twin->buffer; + t->data = twin->data; + t->extra = twin->extra; + twin->buffer = nullptr; + twin->data = nullptr; + twin->extra = nullptr; } if (resident_runtime_params_buffer != nullptr) { ggml_backend_buffer_free(resident_runtime_params_buffer); @@ -2436,7 +2439,7 @@ struct GGMLRunner { size_t free_vram = 0, total_vram = 0; ggml_backend_dev_memory(dev, &free_vram, &total_vram); constexpr size_t safety_margin = 512ull * 1024 * 1024; - size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0; + size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0; if (free_clamp < effective_budget) { LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB", get_desc().c_str(), @@ -2804,12 +2807,12 @@ struct GGMLRunner { } template - std::optional> compute_streaming_segments(ggml_cgraph* gf, + std::optional> compute_streaming_segments(ggml_cgraph* gf, const GraphCutPlan& plan, - size_t residency_budget_bytes, - int n_threads, - bool free_compute_buffer_immediately, - bool no_return = false) { + size_t residency_budget_bytes, + int n_threads, + bool free_compute_buffer_immediately, + bool no_return = false) { GGML_ASSERT(gf != nullptr); // Runtime LoRA mutates CPU weights between calls, so resident GPU @@ -2829,7 +2832,8 @@ struct GGMLRunner { } auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment); for (ggml_tensor* t : seg_params) { - if (t == nullptr) continue; + if (t == nullptr) + continue; resident_params.push_back(t); token ^= reinterpret_cast(t) * 0x9E3779B97F4A7C15ull; } @@ -2861,7 +2865,7 @@ struct GGMLRunner { for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { int64_t t_segment_begin = ggml_time_ms(); const auto& segment = plan.segments[seg_idx]; - const bool is_last = seg_idx + 1 == plan.segments.size(); + const bool is_last = seg_idx + 1 == plan.segments.size(); auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)", @@ -2897,7 +2901,7 @@ struct GGMLRunner { } ggml_context* segment_graph_ctx = nullptr; - ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); + ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); auto segment_output = execute_graph(segment_graph, n_threads, /*free_compute_buffer_immediately=*/true, @@ -2927,7 +2931,6 @@ struct GGMLRunner { } public: - virtual std::string get_desc() = 0; GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend) diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp index 78ece2b91..c4ff18814 100644 --- a/src/layer_registry.cpp +++ b/src/layer_registry.cpp @@ -6,122 +6,127 @@ namespace sd::layer_registry { -void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) { - auto& info = layers_[name]; - info.tensors.push_back(tensor); - info.bytes += ggml_nbytes(tensor); -} - -bool LayerRegistry::move_layer_to_gpu(const std::string& name) { - auto it = layers_.find(name); - if (it == layers_.end()) return false; - - LayerInfo& info = it->second; - if (info.on_gpu) return true; - if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) { - LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU", - name.c_str()); - return false; - } - if (info.tensors.empty()) { - info.on_gpu = true; - return true; + void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) { + auto& info = layers_[name]; + info.tensors.push_back(tensor); + info.bytes += ggml_nbytes(tensor); } - // 1. Build a no_alloc context big enough to hold one twin tensor per CPU - // tensor, plus a little overhead. - const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024; - ggml_init_params ctx_params{ ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true }; - ggml_context* twin_ctx = ggml_init(ctx_params); - if (twin_ctx == nullptr) { - LOG_ERROR("layer_registry: failed to allocate twin context for '%s'", - name.c_str()); - return false; - } + bool LayerRegistry::move_layer_to_gpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) + return false; + + LayerInfo& info = it->second; + if (info.on_gpu) + return true; + if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) { + LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU", + name.c_str()); + return false; + } + if (info.tensors.empty()) { + info.on_gpu = true; + return true; + } - // 2. Create one GPU twin per CPU tensor. The twin shares the original - // name so any name-based lookup keeps working. - std::vector gpu_twins; - gpu_twins.reserve(info.tensors.size()); - for (ggml_tensor* cpu_t : info.tensors) { - ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t); - if (cpu_t->name[0] != '\0') { - ggml_set_name(twin, cpu_t->name); + // 1. Build a no_alloc context big enough to hold one twin tensor per CPU + // tensor, plus a little overhead. + const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024; + ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true}; + ggml_context* twin_ctx = ggml_init(ctx_params); + if (twin_ctx == nullptr) { + LOG_ERROR("layer_registry: failed to allocate twin context for '%s'", + name.c_str()); + return false; } - gpu_twins.push_back(twin); - } - // 3. Back the twins with a GPU buffer in one alloc call. - ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_); - if (gpu_buffer == nullptr) { - LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'", - name.c_str()); - ggml_free(twin_ctx); - return false; - } + // 2. Create one GPU twin per CPU tensor. The twin shares the original + // name so any name-based lookup keeps working. + std::vector gpu_twins; + gpu_twins.reserve(info.tensors.size()); + for (ggml_tensor* cpu_t : info.tensors) { + ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t); + if (cpu_t->name[0] != '\0') { + ggml_set_name(twin, cpu_t->name); + } + gpu_twins.push_back(twin); + } - // 4. H2D copy + sync. - for (size_t i = 0; i < info.tensors.size(); ++i) { - ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]); - } - ggml_backend_synchronize(gpu_backend_); + // 3. Back the twins with a GPU buffer in one alloc call. + ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_); + if (gpu_buffer == nullptr) { + LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'", + name.c_str()); + ggml_free(twin_ctx); + return false; + } - // 5. Swap buffer/data/extra so the originals now point at GPU memory. - for (size_t i = 0; i < info.tensors.size(); ++i) { - std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer); - std::swap(info.tensors[i]->data, gpu_twins[i]->data); - std::swap(info.tensors[i]->extra, gpu_twins[i]->extra); - } + // 4. H2D copy + sync. + for (size_t i = 0; i < info.tensors.size(); ++i) { + ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]); + } + ggml_backend_synchronize(gpu_backend_); - info.gpu_twins = std::move(gpu_twins); - info.twin_ctx = twin_ctx; - info.gpu_buffer = gpu_buffer; - info.on_gpu = true; - return true; -} - -bool LayerRegistry::move_layer_to_cpu(const std::string& name) { - auto it = layers_.find(name); - if (it == layers_.end()) return false; - - LayerInfo& info = it->second; - if (!info.on_gpu) return true; - if (info.tensors.size() != info.gpu_twins.size()) { - LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'", - name.c_str()); - return false; + // 5. Swap buffer/data/extra so the originals now point at GPU memory. + for (size_t i = 0; i < info.tensors.size(); ++i) { + std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, gpu_twins[i]->extra); + } + + info.gpu_twins = std::move(gpu_twins); + info.twin_ctx = twin_ctx; + info.gpu_buffer = gpu_buffer; + info.on_gpu = true; + return true; } - // 1. Swap back: originals point at CPU memory again. - for (size_t i = 0; i < info.tensors.size(); ++i) { - if (info.gpu_twins[i] == nullptr) continue; - std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer); - std::swap(info.tensors[i]->data, info.gpu_twins[i]->data); - std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra); + bool LayerRegistry::move_layer_to_cpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) + return false; + + LayerInfo& info = it->second; + if (!info.on_gpu) + return true; + if (info.tensors.size() != info.gpu_twins.size()) { + LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'", + name.c_str()); + return false; + } + + // 1. Swap back: originals point at CPU memory again. + for (size_t i = 0; i < info.tensors.size(); ++i) { + if (info.gpu_twins[i] == nullptr) + continue; + std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, info.gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra); + } + + // 2. Free the GPU buffer + twin context. + if (info.gpu_buffer != nullptr) { + ggml_backend_buffer_free(info.gpu_buffer); + info.gpu_buffer = nullptr; + } + if (info.twin_ctx != nullptr) { + ggml_free(info.twin_ctx); + info.twin_ctx = nullptr; + } + info.gpu_twins.clear(); + info.on_gpu = false; + return true; } - // 2. Free the GPU buffer + twin context. - if (info.gpu_buffer != nullptr) { - ggml_backend_buffer_free(info.gpu_buffer); - info.gpu_buffer = nullptr; + bool LayerRegistry::is_layer_on_gpu(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() && it->second.on_gpu; } - if (info.twin_ctx != nullptr) { - ggml_free(info.twin_ctx); - info.twin_ctx = nullptr; + + size_t LayerRegistry::get_layer_size(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() ? it->second.bytes : 0; } - info.gpu_twins.clear(); - info.on_gpu = false; - return true; -} - -bool LayerRegistry::is_layer_on_gpu(const std::string& name) const { - auto it = layers_.find(name); - return it != layers_.end() && it->second.on_gpu; -} - -size_t LayerRegistry::get_layer_size(const std::string& name) const { - auto it = layers_.find(name); - return it != layers_.end() ? it->second.bytes : 0; -} } // namespace sd::layer_registry diff --git a/src/layer_registry.h b/src/layer_registry.h index 6727130e4..2dc755349 100644 --- a/src/layer_registry.h +++ b/src/layer_registry.h @@ -6,43 +6,44 @@ #include #include -#include "ggml.h" #include "ggml-backend.h" +#include "ggml.h" namespace sd::layer_registry { -struct LayerInfo { - std::vector tensors; - std::vector gpu_twins; - ggml_context* twin_ctx = nullptr; - ggml_backend_buffer_t gpu_buffer = nullptr; - bool on_gpu = false; - size_t bytes = 0; -}; - -class LayerRegistry { -public: - LayerRegistry() = default; - LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) - : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} - - void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { - gpu_backend_ = gpu_backend; cpu_backend_ = cpu_backend; - } - void register_layer(const std::string& name, ggml_tensor* tensor); - bool move_layer_to_gpu(const std::string& name); - bool move_layer_to_cpu(const std::string& name); - bool is_layer_on_gpu(const std::string& name) const; - size_t get_layer_size(const std::string& name) const; - size_t get_layer_count() const { return layers_.size(); } - - const std::map& layers() const { return layers_; } - -private: - ggml_backend_t gpu_backend_ = nullptr; - ggml_backend_t cpu_backend_ = nullptr; - std::map layers_; -}; + struct LayerInfo { + std::vector tensors; + std::vector gpu_twins; + ggml_context* twin_ctx = nullptr; + ggml_backend_buffer_t gpu_buffer = nullptr; + bool on_gpu = false; + size_t bytes = 0; + }; + + class LayerRegistry { + public: + LayerRegistry() = default; + LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) + : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} + + void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { + gpu_backend_ = gpu_backend; + cpu_backend_ = cpu_backend; + } + void register_layer(const std::string& name, ggml_tensor* tensor); + bool move_layer_to_gpu(const std::string& name); + bool move_layer_to_cpu(const std::string& name); + bool is_layer_on_gpu(const std::string& name) const; + size_t get_layer_size(const std::string& name) const; + size_t get_layer_count() const { return layers_.size(); } + + const std::map& layers() const { return layers_; } + + private: + ggml_backend_t gpu_backend_ = nullptr; + ggml_backend_t cpu_backend_ = nullptr; + std::map layers_; + }; } // namespace sd::layer_registry diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 03aa29005..9434fa5ed 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -4192,7 +4192,7 @@ static std::optional prepare_image_generation_embeds(sd_c std::vector> empty_ref_images; condition_params.ref_images = &empty_ref_images; uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, - condition_params); + condition_params); if (uncond.c_concat.empty()) { uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize } @@ -4210,9 +4210,9 @@ static std::optional prepare_image_generation_embeds(sd_c ImageGenerationEmbeds embeds; embeds.img_cond = std::move(img_cond); - embeds.cond = std::move(cond); - embeds.uncond = std::move(uncond); - embeds.id_cond = std::move(id_cond); + embeds.cond = std::move(cond); + embeds.uncond = std::move(uncond); + embeds.id_cond = std::move(id_cond); return embeds; }