diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 673699fde..0ecc72dc6 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -438,6 +438,10 @@ ArgOptions SDContextParams::get_options() { }; options.bool_options = { + {"", + "--stream-layers", + "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)", + true, &stream_layers}, {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", @@ -720,6 +724,7 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" + << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n" << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" @@ -800,6 +805,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f qwen_image_zero_cond_t, str_to_vae_format(vae_format), max_vram, + stream_layers, backend.c_str(), params_backend.c_str(), }; diff --git a/examples/common/common.h b/examples/common/common.h index 6266f0abe..cd02d212a 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -113,6 +113,7 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; float max_vram = 0.f; + bool stream_layers = false; std::string backend; std::string params_backend; bool enable_mmap = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index e15f1aa7d..8654c01d8 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -222,6 +222,7 @@ typedef struct { bool qwen_image_zero_cond_t; enum sd_vae_format_t vae_format; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto 
free VRAM minus 1 GiB) + bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; } sd_ctx_params_t; diff --git a/src/conditioner.hpp b/src/conditioner.hpp index bb4e1c267..157d3906b 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -118,6 +118,7 @@ struct Conditioner { virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} + virtual void set_stream_layers_enabled(bool enabled) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} virtual std::tuple> get_learned_condition_with_trigger(int n_threads, @@ -210,6 +211,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + text_model->set_stream_layers_enabled(enabled); + if (sd_version_is_sdxl(version)) { + text_model2->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { text_model->set_flash_attention_enabled(enabled); if (sd_version_is_sdxl(version)) { @@ -843,6 +851,18 @@ struct SD3CLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (clip_l) { + clip_l->set_stream_layers_enabled(enabled); + } + if (clip_g) { + clip_g->set_stream_layers_enabled(enabled); + } + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { if (clip_l) { clip_l->set_flash_attention_enabled(enabled); @@ -1200,6 +1220,15 @@ struct FluxCLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (clip_l) { + clip_l->set_stream_layers_enabled(enabled); + } + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) 
override { if (clip_l) { clip_l->set_flash_attention_enabled(enabled); @@ -1434,6 +1463,12 @@ struct T5CLIPEmbedder : public Conditioner { } } + void set_stream_layers_enabled(bool enabled) override { + if (t5) { + t5->set_stream_layers_enabled(enabled); + } + } + void set_flash_attention_enabled(bool enabled) override { if (t5) { t5->set_flash_attention_enabled(enabled); @@ -1617,6 +1652,10 @@ struct AnimaConditioner : public Conditioner { llm->set_max_graph_vram_bytes(max_vram_bytes); } + void set_stream_layers_enabled(bool enabled) override { + llm->set_stream_layers_enabled(enabled); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -1765,6 +1804,10 @@ struct LLMEmbedder : public Conditioner { llm->set_max_graph_vram_bytes(max_vram_bytes); } + void set_stream_layers_enabled(bool enabled) override { + llm->set_stream_layers_enabled(enabled); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index b018852a3..ef104368a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -28,6 +28,7 @@ #include "ggml.h" #include "ggml_extend_backend.h" #include "ggml_graph_cut.h" +#include "layer_registry.h" #include "model.h" #include "tensor.hpp" @@ -1697,7 +1698,18 @@ struct GGMLRunner { ggml_context* partial_offload_ctx = nullptr; ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; std::vector> partial_offload_pairs; + + // Params kept on the runtime backend across streaming segments. 
+ ggml_context* resident_offload_ctx = nullptr; + std::vector> resident_offload_pairs; + ggml_backend_buffer_t resident_runtime_params_buffer = nullptr; + std::unordered_set resident_param_set; + uint64_t resident_state_token = 0; + size_t max_graph_vram_bytes = 0; + bool stream_layers_enabled = false; + + sd::layer_registry::LayerRegistry layer_registry_; std::shared_ptr weight_adapter = nullptr; @@ -2165,6 +2177,9 @@ struct GGMLRunner { if (tensor == nullptr) { continue; } + if (resident_param_set.find(tensor) != resident_param_set.end()) { + continue; + } if (seen_tensors.insert(tensor).second) { unique_tensors.push_back(tensor); } @@ -2287,6 +2302,114 @@ struct GGMLRunner { } } + bool offload_resident_params(const std::vector& tensors) { + if (params_backend == runtime_backend) { + return true; + } + if (tensors.empty()) { + return true; + } + GGML_ASSERT(resident_runtime_params_buffer == nullptr); + GGML_ASSERT(resident_offload_ctx == nullptr); + GGML_ASSERT(resident_offload_pairs.empty()); + GGML_ASSERT(resident_param_set.empty()); + + std::vector unique_tensors; + std::unordered_set seen; + unique_tensors.reserve(tensors.size()); + seen.reserve(tensors.size()); + for (ggml_tensor* t : tensors) { + if (t == nullptr) + continue; + if (seen.insert(t).second) + unique_tensors.push_back(t); + } + if (unique_tensors.empty()) + return true; + + ggml_init_params init = {}; + init.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); + init.mem_buffer = nullptr; + init.no_alloc = true; + resident_offload_ctx = ggml_init(init); + GGML_ASSERT(resident_offload_ctx != nullptr); + + resident_offload_pairs.reserve(unique_tensors.size()); + for (ggml_tensor* t : unique_tensors) { + GGML_ASSERT(t->view_src == nullptr); + ggml_tensor* twin = ggml_dup_tensor(resident_offload_ctx, t); + ggml_set_name(twin, t->name); + resident_offload_pairs.push_back({t, twin}); + } + + resident_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(resident_offload_ctx, 
runtime_backend); + if (resident_runtime_params_buffer == nullptr) { + LOG_ERROR("%s alloc resident runtime params backend buffer failed, num_tensors = %zu", + get_desc().c_str(), resident_offload_pairs.size()); + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + resident_offload_pairs.clear(); + return false; + } + ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + for (auto& pair : resident_offload_pairs) { + ggml_tensor* t = pair.first; + ggml_tensor* twin = pair.second; + ggml_backend_tensor_copy(t, twin); + std::swap(t->buffer, twin->buffer); + std::swap(t->data, twin->data); + std::swap(t->extra, twin->extra); + resident_param_set.insert(t); + } + ggml_backend_synchronize(runtime_backend); + + size_t sz = ggml_backend_buffer_get_size(resident_runtime_params_buffer); + LOG_INFO("%s offload resident params (%6.2f MB, %zu tensors) to runtime backend (%s)", + get_desc().c_str(), + sz / (1024.f * 1024.f), + resident_offload_pairs.size(), + ggml_backend_name(runtime_backend)); + return true; + } + + void restore_resident_params() { + if (resident_offload_pairs.empty()) { + if (resident_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(resident_runtime_params_buffer); + resident_runtime_params_buffer = nullptr; + } + if (resident_offload_ctx != nullptr) { + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + } + resident_param_set.clear(); + resident_state_token = 0; + return; + } + for (auto& pair : resident_offload_pairs) { + ggml_tensor* t = pair.first; + ggml_tensor* twin = pair.second; + t->buffer = twin->buffer; + t->data = twin->data; + t->extra = twin->extra; + twin->buffer = nullptr; + twin->data = nullptr; + twin->extra = nullptr; + } + if (resident_runtime_params_buffer != nullptr) { + ggml_backend_buffer_free(resident_runtime_params_buffer); + resident_runtime_params_buffer = nullptr; + } + resident_offload_pairs.clear(); + if (resident_offload_ctx != 
nullptr) { + ggml_free(resident_offload_ctx); + resident_offload_ctx = nullptr; + } + resident_param_set.clear(); + resident_state_token = 0; + } + bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) { return plan.has_cuts && plan.valid && @@ -2303,20 +2426,85 @@ struct GGMLRunner { } bool resolve_graph_cut_plan(ggml_cgraph* gf, - GraphCutPlan* plan_out) { + GraphCutPlan* plan_out, + size_t* effective_budget_out = nullptr) { GGML_ASSERT(plan_out != nullptr); GGML_ASSERT(gf != nullptr); + + // Keep the plan and resident params under the same live-VRAM cap. + size_t effective_budget = max_graph_vram_bytes; + if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) { + ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend); + if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + size_t free_vram = 0, total_vram = 0; + ggml_backend_dev_memory(dev, &free_vram, &total_vram); + // Resident params from a previous call already occupy VRAM that belongs to this + // budget; count them as free so the clamp (and the resident set) stays stable. + if (resident_runtime_params_buffer != nullptr) { + free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer); + } + constexpr size_t safety_margin = 512ull * 1024 * 1024; + size_t free_clamp = (free_vram > safety_margin) ?
(free_vram - safety_margin) : 0; + if (free_clamp < effective_budget) { + LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB", + get_desc().c_str(), + free_clamp / (1024.0 * 1024.0), + effective_budget / (1024.0 * 1024.0)); + effective_budget = free_clamp; + } + } + } + + if (effective_budget_out != nullptr) { + *effective_budget_out = effective_budget; + } + *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, - max_graph_vram_bytes, + effective_budget, params_tensor_set_, get_desc().c_str()); + if (stream_layers_enabled) { + LOG_INFO("%s streaming budget = %.2f MB", + get_desc().c_str(), + effective_budget / (1024.0 * 1024.0)); + } return true; } + struct PersistentExternalBinding { + ggml_backend_buffer_t buffer = nullptr; + void* data = nullptr; + void* extra = nullptr; + }; + + void snapshot_persistent_externals(const sd::ggml_graph_cut::Plan& plan, + ggml_cgraph* gf, + std::unordered_map& out) { + GGML_ASSERT(gf != nullptr); + out.clear(); + for (const auto& segment : plan.segments) { + for (const auto& input : segment.input_refs) { + if (input.type != GraphCutSegment::INPUT_EXTERNAL) { + continue; + } + ggml_tensor* tensor = sd::ggml_graph_cut::input_tensor(gf, input); + if (tensor == nullptr || tensor->buffer == nullptr) { + continue; + } + PersistentExternalBinding binding; + binding.buffer = tensor->buffer; + binding.data = tensor->data; + binding.extra = tensor->extra; + out[tensor] = binding; + } + } + } + void reset_segment_runtime_tensors(const GraphCutSegment& segment, - ggml_cgraph* gf) { + ggml_cgraph* gf, + const std::unordered_map* persistent_externals = nullptr) { GGML_ASSERT(gf != nullptr); for (const auto& input : segment.input_refs) { @@ -2326,11 +2514,25 @@ struct GGMLRunner { } switch (input.type) { case GraphCutSegment::INPUT_PREVIOUS_CUT: - case GraphCutSegment::INPUT_EXTERNAL: input_tensor->buffer = nullptr; input_tensor->data = nullptr; input_tensor->extra = nullptr;
break; + case GraphCutSegment::INPUT_EXTERNAL: { + if (persistent_externals != nullptr) { + auto it = persistent_externals->find(input_tensor); + if (it != persistent_externals->end()) { + input_tensor->buffer = it->second.buffer; + input_tensor->data = it->second.data; + input_tensor->extra = it->second.extra; + break; + } + } + input_tensor->buffer = nullptr; + input_tensor->data = nullptr; + input_tensor->extra = nullptr; + break; + } case GraphCutSegment::INPUT_PARAM: break; } @@ -2545,6 +2742,9 @@ struct GGMLRunner { free_compute_buffer(); free_cache_ctx_and_buffer(); + std::unordered_map persistent_externals; + snapshot_persistent_externals(plan, gf, persistent_externals); + std::optional> output = sd::Tensor(); for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { int64_t t_segment_begin = ggml_time_ms(); @@ -2556,7 +2756,7 @@ struct GGMLRunner { plan.segments.size(), segment.group_name.c_str()); - reset_segment_runtime_tensors(segment, gf); + reset_segment_runtime_tensors(segment, gf, &persistent_externals); if (!bind_segment_cached_inputs(gf, segment)) { free_cache_ctx_and_buffer(); free_compute_buffer(); @@ -2601,6 +2801,135 @@ struct GGMLRunner { return output; } +public: + void release_streaming_residency() { + restore_resident_params(); + } + + template + std::optional> compute_streaming_segments(ggml_cgraph* gf, + const GraphCutPlan& plan, + size_t residency_budget_bytes, + int n_threads, + bool free_compute_buffer_immediately, + bool no_return = false) { + GGML_ASSERT(gf != nullptr); + + // Runtime LoRA mutates CPU weights between calls, so resident GPU + // copies would go stale. 
+ if (weight_adapter != nullptr) { + restore_resident_params(); + } else { + sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan; + if (base_plan.available) { + sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes); + + std::vector resident_params; + uint64_t token = 0; + for (const auto& segment : base_plan.segments) { + if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) { + continue; + } + auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment); + for (ggml_tensor* t : seg_params) { + if (t == nullptr) + continue; + resident_params.push_back(t); + token ^= reinterpret_cast(t) * 0x9E3779B97F4A7C15ull; + } + } + if (token != resident_state_token) { + restore_resident_params(); + if (!resident_params.empty()) { + if (offload_resident_params(resident_params)) { + resident_state_token = token; + } else { + LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming", + get_desc().c_str()); + restore_resident_params(); + } + } + } + } + } + + free_compute_buffer(); + free_cache_ctx_and_buffer(); + + layer_registry_.move_layer_to_gpu("_global"); + + std::unordered_map persistent_externals; + snapshot_persistent_externals(plan, gf, persistent_externals); + + std::optional> output = sd::Tensor(); + for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { + int64_t t_segment_begin = ggml_time_ms(); + const auto& segment = plan.segments[seg_idx]; + const bool is_last = seg_idx + 1 == plan.segments.size(); + auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); + + LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)", + get_desc().c_str(), + seg_idx + 1, + plan.segments.size(), + segment.group_name.c_str(), + segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? 
"RESIDENT" : "STREAMED"); + + if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { + LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)", + get_desc().c_str(), + segment.group_name.c_str()); + } + + reset_segment_runtime_tensors(segment, gf, &persistent_externals); + if (!bind_segment_cached_inputs(gf, segment)) { + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + } + + if (!is_last) { + for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { + ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); + if (out_tensor != nullptr && + sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) && + future_cut_names.find(out_tensor->name) != future_cut_names.end()) { + cache(out_tensor->name, out_tensor); + } + } + } + + ggml_context* segment_graph_ctx = nullptr; + ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); + auto segment_output = execute_graph(segment_graph, + n_threads, + /*free_compute_buffer_immediately=*/true, + sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()), + /*preserve_backend_tensor_data_map=*/true, + /*no_return=*/!is_last || no_return, + &future_cut_names); + ggml_free(segment_graph_ctx); + if (!segment_output.has_value()) { + free_cache_ctx_and_buffer(); + free_compute_buffer(); + free_compute_ctx(); + return std::nullopt; + } + output = std::move(segment_output); + + if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { + layer_registry_.move_layer_to_cpu(segment.group_name); + } + (void)t_segment_begin; + } + + backend_tensor_data_map.clear(); + free_cache_ctx_and_buffer(); + free_compute_ctx(); + return output; + } + public: virtual std::string get_desc() = 0; @@ -2610,9 +2939,11 @@ struct GGMLRunner { GGML_ASSERT(runtime_backend != nullptr); GGML_ASSERT(params_backend != nullptr); 
alloc_params_ctx(); + layer_registry_.set_backends(runtime_backend, params_backend); } virtual ~GGMLRunner() { + restore_resident_params(); free_params_buffer(); free_compute_buffer(); free_params_ctx(); @@ -2685,6 +3016,8 @@ struct GGMLRunner { } void free_params_buffer() { + // Restore swapped resident params before freeing their backing buffer. + restore_resident_params(); if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; @@ -2784,11 +3117,20 @@ struct GGMLRunner { if (can_attempt_graph_cut_segmented_compute()) { GraphCutPlan plan; - if (!resolve_graph_cut_plan(gf, &plan)) { + size_t effective_graph_vram_bytes = 0; + if (!resolve_graph_cut_plan(gf, &plan, &effective_graph_vram_bytes)) { free_compute_ctx(); return std::nullopt; } if (should_use_graph_cut_segmented_compute(plan)) { + if (stream_layers_enabled) { + return compute_streaming_segments(gf, + plan, + effective_graph_vram_bytes, + n_threads, + free_compute_buffer_immediately, + no_return); + } return compute_with_graph_cuts(gf, plan, n_threads, @@ -2829,6 +3171,12 @@ struct GGMLRunner { max_graph_vram_bytes = max_vram_bytes; } + void set_stream_layers_enabled(bool enabled) { + stream_layers_enabled = enabled; + } + + sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } + ggml_backend_t get_runtime_backend() { return runtime_backend; } diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index a20de30c9..61234eaf2 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -753,4 +753,54 @@ namespace sd::ggml_graph_cut { return resolved_plan; } + void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) { + // Cached plans may be reused with a smaller live budget. 
+ for (auto& seg : plan.segments) { + seg.residency = SegmentResidency::STREAMED; + } + if (max_graph_vram_bytes == 0 || plan.segments.size() < 2) { + return; + } + + bool any_param_bearing = false; + for (const auto& seg : plan.segments) { + if (seg.input_param_bytes > 0) { + any_param_bearing = true; + break; + } + } + if (!any_param_bearing) { + return; + } + + // Leave room for the largest active streamed segment. + size_t worst_streamed_footprint = 0; + for (const auto& seg : plan.segments) { + const size_t seg_footprint = seg.input_param_bytes + + seg.compute_buffer_size + + seg.output_bytes + + seg.input_previous_cut_bytes + + seg.input_external_bytes; + if (seg_footprint > worst_streamed_footprint) { + worst_streamed_footprint = seg_footprint; + } + } + constexpr size_t safety = 512ull * 1024 * 1024; + const size_t reserved = safety + worst_streamed_footprint; + + if (max_graph_vram_bytes <= reserved) { + return; + } + const size_t available = max_graph_vram_bytes - reserved; + + size_t cumulative = 0; + for (auto& seg : plan.segments) { + if (cumulative + seg.input_param_bytes > available) { + break; + } + seg.residency = SegmentResidency::RESIDENT; + cumulative += seg.input_param_bytes; + } + } + } // namespace sd::ggml_graph_cut diff --git a/src/ggml_graph_cut.h b/src/ggml_graph_cut.h index dcd1551a7..9668bc38e 100644 --- a/src/ggml_graph_cut.h +++ b/src/ggml_graph_cut.h @@ -2,6 +2,7 @@ #define __SD_GGML_GRAPH_CUT_H__ #include +#include #include #include #include @@ -11,6 +12,12 @@ namespace sd::ggml_graph_cut { + // Streaming residency for a segment's params. 
+ enum class SegmentResidency : uint8_t { + STREAMED = 0, + RESIDENT = 1, + }; + struct Segment { enum InputType { INPUT_EXTERNAL = 0, @@ -34,6 +41,7 @@ namespace sd::ggml_graph_cut { std::vector internal_node_indices; std::vector output_node_indices; std::vector input_refs; + SegmentResidency residency = SegmentResidency::STREAMED; }; struct Plan { @@ -101,6 +109,9 @@ namespace sd::ggml_graph_cut { size_t max_graph_vram_bytes, const std::unordered_set& params_tensor_set, const char* log_desc); + + // Mark leading segments resident when they fit after streamed-segment headroom. + void annotate_residency(Plan& plan, size_t max_graph_vram_bytes); } // namespace sd::ggml_graph_cut #endif diff --git a/src/layer_registry.cpp b/src/layer_registry.cpp new file mode 100644 index 000000000..c4ff18814 --- /dev/null +++ b/src/layer_registry.cpp @@ -0,0 +1,132 @@ +#include "layer_registry.h" + +#include + +#include "util.h" + +namespace sd::layer_registry { + + void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) { + auto& info = layers_[name]; + info.tensors.push_back(tensor); + info.bytes += ggml_nbytes(tensor); + } + + bool LayerRegistry::move_layer_to_gpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) + return false; + + LayerInfo& info = it->second; + if (info.on_gpu) + return true; + if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) { + LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU", + name.c_str()); + return false; + } + if (info.tensors.empty()) { + info.on_gpu = true; + return true; + } + + // 1. Build a no_alloc context big enough to hold one twin tensor per CPU + // tensor, plus a little overhead. 
+ const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024; + ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true}; + ggml_context* twin_ctx = ggml_init(ctx_params); + if (twin_ctx == nullptr) { + LOG_ERROR("layer_registry: failed to allocate twin context for '%s'", + name.c_str()); + return false; + } + + // 2. Create one GPU twin per CPU tensor. The twin shares the original + // name so any name-based lookup keeps working. + std::vector gpu_twins; + gpu_twins.reserve(info.tensors.size()); + for (ggml_tensor* cpu_t : info.tensors) { + ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t); + if (cpu_t->name[0] != '\0') { + ggml_set_name(twin, cpu_t->name); + } + gpu_twins.push_back(twin); + } + + // 3. Back the twins with a GPU buffer in one alloc call. + ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_); + if (gpu_buffer == nullptr) { + LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'", + name.c_str()); + ggml_free(twin_ctx); + return false; + } + + // 4. H2D copy + sync. + for (size_t i = 0; i < info.tensors.size(); ++i) { + ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]); + } + ggml_backend_synchronize(gpu_backend_); + + // 5. Swap buffer/data/extra so the originals now point at GPU memory. 
+ for (size_t i = 0; i < info.tensors.size(); ++i) { + std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, gpu_twins[i]->extra); + } + + info.gpu_twins = std::move(gpu_twins); + info.twin_ctx = twin_ctx; + info.gpu_buffer = gpu_buffer; + info.on_gpu = true; + return true; + } + + bool LayerRegistry::move_layer_to_cpu(const std::string& name) { + auto it = layers_.find(name); + if (it == layers_.end()) + return false; + + LayerInfo& info = it->second; + if (!info.on_gpu) + return true; + if (info.tensors.size() != info.gpu_twins.size()) { + LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'", + name.c_str()); + return false; + } + + // 1. Swap back: originals point at CPU memory again. + for (size_t i = 0; i < info.tensors.size(); ++i) { + if (info.gpu_twins[i] == nullptr) + continue; + std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer); + std::swap(info.tensors[i]->data, info.gpu_twins[i]->data); + std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra); + } + + // 2. Free the GPU buffer + twin context. + if (info.gpu_buffer != nullptr) { + ggml_backend_buffer_free(info.gpu_buffer); + info.gpu_buffer = nullptr; + } + if (info.twin_ctx != nullptr) { + ggml_free(info.twin_ctx); + info.twin_ctx = nullptr; + } + info.gpu_twins.clear(); + info.on_gpu = false; + return true; + } + + bool LayerRegistry::is_layer_on_gpu(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() && it->second.on_gpu; + } + + size_t LayerRegistry::get_layer_size(const std::string& name) const { + auto it = layers_.find(name); + return it != layers_.end() ? 
it->second.bytes : 0; + } + +} // namespace sd::layer_registry diff --git a/src/layer_registry.h b/src/layer_registry.h new file mode 100644 index 000000000..2dc755349 --- /dev/null +++ b/src/layer_registry.h @@ -0,0 +1,50 @@ +#ifndef __LAYER_REGISTRY_H__ +#define __LAYER_REGISTRY_H__ + +#include +#include +#include +#include + +#include "ggml-backend.h" +#include "ggml.h" + +namespace sd::layer_registry { + + struct LayerInfo { + std::vector tensors; + std::vector gpu_twins; + ggml_context* twin_ctx = nullptr; + ggml_backend_buffer_t gpu_buffer = nullptr; + bool on_gpu = false; + size_t bytes = 0; + }; + + class LayerRegistry { + public: + LayerRegistry() = default; + LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) + : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} + + void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { + gpu_backend_ = gpu_backend; + cpu_backend_ = cpu_backend; + } + void register_layer(const std::string& name, ggml_tensor* tensor); + bool move_layer_to_gpu(const std::string& name); + bool move_layer_to_cpu(const std::string& name); + bool is_layer_on_gpu(const std::string& name) const; + size_t get_layer_size(const std::string& name) const; + size_t get_layer_count() const { return layers_.size(); } + + const std::map& layers() const { return layers_; } + + private: + ggml_backend_t gpu_backend_ = nullptr; + ggml_backend_t cpu_backend_ = nullptr; + std::map layers_; + }; + +} // namespace sd::layer_registry + +#endif diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 239115fe3..9434fa5ed 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -189,6 +189,7 @@ class StableDiffusionGGML { sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr}; bool offload_params_to_cpu = false; float max_vram = 0.f; + bool stream_layers = false; bool use_pmid = false; std::string backend_spec; std::string params_backend_spec;
@@ -261,8 +262,18 @@ class StableDiffusionGGML { free_params_immediately = sd_ctx_params->free_params_immediately; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; max_vram = sd_ctx_params->max_vram; + stream_layers = sd_ctx_params->stream_layers; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); + if (stream_layers && max_vram == 0.f) { + LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring"); + stream_layers = false; + } + if (stream_layers && !offload_params_to_cpu && params_backend_spec.empty()) { + // Streaming needs CPU-resident params. + LOG_WARN("--stream-layers has no effect without --offload-to-cpu (or --params-backend); ignoring"); + stream_layers = false; + } bool use_tae = false; bool use_audio_vae = false; @@ -441,7 +452,10 @@ class StableDiffusionGGML { } } } - if (have_quantized_weight) { + // Avoid full-model LoRA merge buffers on constrained setups.
+ const bool streaming_constrained = stream_layers || + sd_ctx_params->offload_params_to_cpu; + if (have_quantized_weight || streaming_constrained) { apply_lora_immediately = false; } else { apply_lora_immediately = true; @@ -737,6 +751,7 @@ class StableDiffusionGGML { get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); + diffusion_model->set_stream_layers_enabled(stream_layers); get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); if (sd_version_is_unet_edit(version)) { @@ -745,6 +760,7 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); + high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); } @@ -2364,6 +2380,15 @@ class StableDiffusionGGML { if (sd_version_is_pid(version)) { return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f); } + // Free resident diffusion params before VAE allocates its compute buffer. 
+ if (stream_layers) { + if (diffusion_model) { + diffusion_model->release_streaming_residency(); + } + if (high_noise_diffusion_model) { + high_noise_diffusion_model->release_streaming_residency(); + } + } auto latents = first_stage_model->diffusion_to_vae_latents(x); first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); @@ -2708,6 +2733,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->max_vram = 0.f; + sd_ctx_params->stream_layers = false; sd_ctx_params->enable_mmap = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2755,6 +2781,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "prediction: %s\n" "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" + "stream_layers: %s\n" "backend: %s\n" "params_backend: %s\n" "keep_clip_on_cpu: %s\n" @@ -2793,6 +2820,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, + BOOL_STR(sd_ctx_params->stream_layers), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), @@ -4164,7 +4192,7 @@ static std::optional prepare_image_generation_embeds(sd_c std::vector> empty_ref_images; condition_params.ref_images = &empty_ref_images; uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, - condition_params); + condition_params); if (uncond.c_concat.empty()) { uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize } @@ -4182,9 +4210,9 @@ static std::optional prepare_image_generation_embeds(sd_c ImageGenerationEmbeds embeds; embeds.img_cond = std::move(img_cond); - embeds.cond 
= std::move(cond); - embeds.uncond = std::move(uncond); - embeds.id_cond = std::move(id_cond); + embeds.cond = std::move(cond); + embeds.uncond = std::move(uncond); + embeds.id_cond = std::move(id_cond); return embeds; } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 1197ce35e..cef26e36d 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -25,6 +25,13 @@ void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) { } } +void UpscalerGGML::set_stream_layers_enabled(bool enabled) { + stream_layers_enabled = enabled; + if (esrgan_upscaler) { + esrgan_upscaler->set_stream_layers_enabled(enabled); + } +} + bool UpscalerGGML::load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, int n_threads) { @@ -76,6 +83,7 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, tile_size, model_loader.get_tensor_storage_map()); esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); + esrgan_upscaler->set_stream_layers_enabled(stream_layers_enabled); if (direct) { esrgan_upscaler->set_conv2d_direct_enabled(true); } diff --git a/src/upscaler.h b/src/upscaler.h index e3967865c..e2197694d 100644 --- a/src/upscaler.h +++ b/src/upscaler.h @@ -18,6 +18,7 @@ struct UpscalerGGML { bool direct = false; int tile_size = 128; size_t max_graph_vram_bytes = 0; + bool stream_layers_enabled = false; std::string backend_spec; std::string params_backend_spec; @@ -31,6 +32,7 @@ struct UpscalerGGML { bool offload_params_to_cpu, int n_threads); void set_max_graph_vram_bytes(size_t max_vram_bytes); + void set_stream_layers_enabled(bool enabled); sd::Tensor upscale_tensor(const sd::Tensor& input_tensor); sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor); };