Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ enum rng_type_t {

enum sample_method_t {
EULER_SAMPLE_METHOD,
EULER_FLOW_FLASH_SAMPLE_METHOD,
EULER_A_SAMPLE_METHOD,
HEUN_SAMPLE_METHOD,
DPM2_SAMPLE_METHOD,
Expand Down
24 changes: 23 additions & 1 deletion src/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ struct SDCondition {
sd::Tensor<float> c_concat;
sd::Tensor<int32_t> c_t5_ids;
sd::Tensor<float> c_t5_weights;
sd::Tensor<int32_t> c_input_ids;
sd::Tensor<int32_t> c_position_ids;
sd::Tensor<int32_t> c_token_types;
sd::Tensor<int32_t> c_image_embed_ranges;
sd::Tensor<int32_t> c_vinput_mask;
std::vector<sd::Tensor<float>> c_vlm_images;
std::vector<sd::Tensor<float>> c_ref_images;

std::vector<sd::Tensor<float>> extra_c_crossattns;

Expand All @@ -26,10 +33,25 @@ struct SDCondition {

bool empty() const {
if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() ||
!c_t5_ids.empty() || !c_t5_weights.empty()) {
!c_t5_ids.empty() || !c_t5_weights.empty() ||
!c_input_ids.empty() || !c_position_ids.empty() ||
!c_token_types.empty() || !c_image_embed_ranges.empty() ||
!c_vinput_mask.empty()) {
return false;
}

for (const auto& tensor : c_vlm_images) {
if (!tensor.empty()) {
return false;
}
}

for (const auto& tensor : c_ref_images) {
if (!tensor.empty()) {
return false;
}
}

for (const auto& tensor : extra_c_crossattns) {
if (!tensor.empty()) {
return false;
Expand Down
31 changes: 29 additions & 2 deletions src/denoiser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,31 @@ static sd::Tensor<float> sample_euler_flow(denoise_cb_t model,
return x;
}

// "Flash" Euler sampler for flow-matching models: at each step the model's
// denoised prediction is taken directly and re-noised toward the next sigma,
//     x = sigma_next * noise * eta + (1 - sigma_next) * denoised
// i.e. the same update as the LCM sampler, with eta acting as an extra
// multiplier on the injected noise.
//
// model  - denoiser callback; returns an empty tensor on failure.
// x      - initial latent (consumed and returned).
// sigmas - noise schedule; sigmas.size() - 1 steps are taken.
// rng    - randomness source for the re-noising term.
// eta    - multiplier applied to the injected noise (s_noise).
// Returns the final latent, or an empty tensor if the model callback fails.
static sd::Tensor<float> sample_euler_flow_flash(denoise_cb_t model,
                                                 sd::Tensor<float> x,
                                                 const std::vector<float>& sigmas,
                                                 std::shared_ptr<RNG> rng,
                                                 float eta) {
    const float s_noise = eta;
    const int steps     = static_cast<int>(sigmas.size()) - 1;
    for (int i = 0; i < steps; i++) {
        float sigma      = sigmas[i];
        float sigma_next = sigmas[i + 1];
        auto denoised_opt = model(x, sigma, i + 1);
        if (denoised_opt.empty()) {
            return {};  // model failed; propagate the empty tensor to the caller
        }
        sd::Tensor<float> denoised = std::move(denoised_opt);
        // Last step of the schedule (sigma_next == 0): the denoised prediction
        // is the final result, no noise is re-injected.
        if (sigma_next == 0.0f) {
            x = std::move(denoised);
            continue;
        }
        auto noise = sd::Tensor<float>::randn_like(x, rng);
        x = sigma_next * noise * s_noise + (1.0f - sigma_next) * denoised;
    }
    return x;
}

static sd::Tensor<float> sample_euler(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas) {
Expand Down Expand Up @@ -1289,8 +1314,8 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
}
sd::Tensor<float> denoised = std::move(denoised_opt);

float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];

auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);

Expand Down Expand Up @@ -1658,6 +1683,8 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
float eta,
bool is_flow_denoiser) {
switch (method) {
case EULER_FLOW_FLASH_SAMPLE_METHOD:
return sample_euler_flow_flash(model, std::move(x), sigmas, rng, eta);
case EULER_A_SAMPLE_METHOD:
if (is_flow_denoiser)
return sample_euler_flow(model, std::move(x), sigmas, rng, eta);
Expand Down
84 changes: 84 additions & 0 deletions src/diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "hidream_o1.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
Expand All @@ -22,6 +23,12 @@ struct DiffusionParams {
const sd::Tensor<float>* t5_weights = nullptr;
const sd::Tensor<float>* guidance = nullptr;
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
const sd::Tensor<int32_t>* input_ids = nullptr;
const sd::Tensor<int32_t>* input_pos = nullptr;
const sd::Tensor<int32_t>* token_types = nullptr;
const sd::Tensor<int32_t>* image_embed_ranges = nullptr;
const sd::Tensor<int32_t>* vinput_mask = nullptr;
const std::vector<sd::Tensor<float>>* vlm_images = nullptr;
bool increase_ref_index = false;
int num_video_frames = -1;
const std::vector<sd::Tensor<float>>* controls = nullptr;
Expand Down Expand Up @@ -476,6 +483,83 @@ struct QwenImageModel : public DiffusionModel {
}
};

// DiffusionModel adapter around the HiDream-O1 runner: every virtual entry
// point is forwarded verbatim to the wrapped HiDreamO1::HiDreamO1Runner.
struct HiDreamO1Model : public DiffusionModel {
    std::string prefix;                     // tensor-name prefix used when collecting params
    HiDreamO1::HiDreamO1Runner hidream_o1;  // wrapped runner that owns weights and graphs

    HiDreamO1Model(ggml_backend_t backend,
                   bool offload_params_to_cpu,
                   const String2TensorStorage& tensor_storage_map = {},
                   const std::string& prefix = "model")
        : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
    }

    std::string get_desc() override {
        return hidream_o1.get_desc();
    }

    void alloc_params_buffer() override {
        hidream_o1.alloc_params_buffer();
    }

    void free_params_buffer() override {
        hidream_o1.free_params_buffer();
    }

    void free_compute_buffer() override {
        hidream_o1.free_compute_buffer();
    }

    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        hidream_o1.get_param_tensors(tensors, prefix);
    }

    size_t get_params_buffer_size() override {
        return hidream_o1.get_params_buffer_size();
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        hidream_o1.set_weight_adapter(adapter);
    }

    // HiDream-O1 has no ADM conditioning input.
    int64_t get_adm_in_channels() override {
        return 0;
    }

    // NOTE(review): unlike the other methods this is not marked `override` —
    // confirm whether DiffusionModel declares it virtual.
    void set_flash_attention_enabled(bool enabled) {
        hidream_o1.set_flash_attention_enabled(enabled);
    }

    void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
        hidream_o1.set_max_graph_vram_bytes(max_vram_bytes);
    }

    void set_circular_axes(bool circular_x, bool circular_y) override {
        hidream_o1.set_circular_axes(circular_x, circular_y);
    }

    // Run one diffusion step. x, timesteps and the token inputs are mandatory;
    // image-embed ranges, the visual-input mask, VLM images and reference
    // latents are optional and fall back to shared empty placeholders.
    sd::Tensor<float> compute(int n_threads,
                              const DiffusionParams& diffusion_params) override {
        GGML_ASSERT(diffusion_params.x != nullptr);
        GGML_ASSERT(diffusion_params.timesteps != nullptr);
        GGML_ASSERT(diffusion_params.input_ids != nullptr);
        GGML_ASSERT(diffusion_params.input_pos != nullptr);
        GGML_ASSERT(diffusion_params.token_types != nullptr);
        static const sd::Tensor<int32_t> no_i32;                 // empty i32 placeholder
        static const std::vector<sd::Tensor<float>> no_images;   // empty image-list placeholder
        const auto& ranges = diffusion_params.image_embed_ranges ? *diffusion_params.image_embed_ranges : no_i32;
        const auto& vmask  = diffusion_params.vinput_mask ? *diffusion_params.vinput_mask : no_i32;
        const auto& vlm    = diffusion_params.vlm_images ? *diffusion_params.vlm_images : no_images;
        const auto& refs   = diffusion_params.ref_latents ? *diffusion_params.ref_latents : no_images;
        return hidream_o1.compute(n_threads,
                                  *diffusion_params.x,
                                  *diffusion_params.timesteps,
                                  *diffusion_params.input_ids,
                                  *diffusion_params.input_pos,
                                  *diffusion_params.token_types,
                                  ranges,
                                  vmask,
                                  vlm,
                                  refs);
    }
};

struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
Expand Down
95 changes: 88 additions & 7 deletions src/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1698,13 +1698,41 @@ struct WeightAdapter {
};

// Per-graph-build context handed to model code: the backend and ggml context
// the graph is being built in, feature toggles, and optional hooks for debug
// tensor capture and cross-run tensor caching.
struct GGMLRunnerContext {
    ggml_backend_t backend = nullptr;    // backend the graph will execute on
    ggml_context* ggml_ctx = nullptr;    // context that owns nodes created via this struct
    bool flash_attn_enabled = false;
    bool conv2d_direct_enabled = false;
    bool circular_x_enabled = false;     // wrap-around (circular) padding along x
    bool circular_y_enabled = false;     // wrap-around (circular) padding along y
    std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
    // When non-null, capture_tensor() appends (snapshot, name) pairs here so
    // the owning runner can dump them after the graph has run.
    std::vector<std::pair<ggml_tensor*, std::string>>* debug_tensors = nullptr;
    // Optional cache hooks installed by the owning runner; either may be empty.
    std::function<ggml_tensor*(const std::string&)> get_cache_tensor;
    std::function<void(const std::string&, ggml_tensor*)> cache_tensor;

    // Record a named debug snapshot of `tensor`. The value is copied into a
    // fresh tensor (ggml_cont to densify, ggml_dup_tensor + ggml_cpy to copy)
    // so later in-place graph ops cannot clobber it, and marked as a graph
    // output so it is not pruned. No-op when capture is disabled or `tensor`
    // is null.
    void capture_tensor(const std::string& name, ggml_tensor* tensor) {
        if (debug_tensors == nullptr || tensor == nullptr) {
            return;
        }
        ggml_tensor* snapshot = ggml_cont(ggml_ctx, tensor);
        ggml_tensor* dst = ggml_dup_tensor(ggml_ctx, snapshot);
        snapshot = ggml_cpy(ggml_ctx, snapshot, dst);
        ggml_set_output(snapshot);
        debug_tensors->push_back({snapshot, name});
    }

    // Look up a previously cached tensor by name; returns nullptr when no
    // cache hook is installed (or, via the hook, when the name is unknown).
    ggml_tensor* load_cache_tensor(const std::string& name) const {
        if (!get_cache_tensor) {
            return nullptr;
        }
        return get_cache_tensor(name);
    }

    // Ask the owning runner to keep `tensor` alive across graph runs under
    // `name`. No-op when no hook is installed or `tensor` is null.
    void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const {
        if (!cache_tensor || tensor == nullptr) {
            return;
        }
        cache_tensor(name, tensor);
    }
};

struct GGMLRunner {
Expand Down Expand Up @@ -1743,6 +1771,7 @@ struct GGMLRunner {

std::map<ggml_tensor*, const void*> backend_tensor_data_map;
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
std::vector<std::pair<ggml_tensor*, std::string>> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";

bool flash_attn_enabled = false;
Expand Down Expand Up @@ -1838,6 +1867,7 @@ struct GGMLRunner {
}

void free_compute_ctx() {
debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
Expand Down Expand Up @@ -1884,6 +1914,16 @@ struct GGMLRunner {
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
for (const auto& entry : debug_tensors) {
if (entry.first != nullptr) {
ggml_build_forward_expand(gf, entry.first);
}
}
for (const auto& entry : cache_tensor_map) {
if (entry.second != nullptr) {
ggml_build_forward_expand(gf, entry.second);
}
}
prepare_build_in_tensor_after(gf);
return gf;
}
Expand Down Expand Up @@ -2031,6 +2071,21 @@ struct GGMLRunner {
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
if (tensor == nullptr || data == nullptr) {
continue;
}
const char* name = ggml_get_name(tensor);
if (tensor->buffer == nullptr) {
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
get_desc().c_str(),
name != nullptr ? name : "",
(long long)tensor->ne[0],
(long long)tensor->ne[1],
(long long)tensor->ne[2],
(long long)tensor->ne[3],
ggml_type_name(tensor->type));
continue;
}

if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) {
continue;
Expand Down Expand Up @@ -2421,6 +2476,22 @@ struct GGMLRunner {
return std::nullopt;
}

for (const auto& entry : debug_tensors) {
auto tensor = entry.first;
if (tensor == nullptr) {
continue;
}
if (tensor->type != GGML_TYPE_F32) {
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
get_desc().c_str(),
entry.second.c_str(),
ggml_type_name(tensor->type));
continue;
}
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
print_sd_tensor(debug_tensor, false, entry.second.c_str());
}

int64_t t_cache_begin = ggml_time_ms();
if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) {
if (free_compute_buffer_immediately) {
Expand Down Expand Up @@ -2557,6 +2628,13 @@ struct GGMLRunner {
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
runner_ctx.debug_tensors = &debug_tensors;
runner_ctx.get_cache_tensor = [this](const std::string& name) {
return this->get_cache_tensor_by_name(name);
};
runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) {
this->cache(name, tensor);
};
return runner_ctx;
}

Expand Down Expand Up @@ -2659,6 +2737,9 @@ struct GGMLRunner {
}

// Register `tensor` under `name` so it is copied into the cache buffer after
// the graph runs and survives across compute calls. Views are made contiguous
// first so the cached entry owns densely-packed data rather than aliasing
// another tensor's storage. A null tensor stores a null slot.
// (name is taken by const reference — the previous by-value `const std::string`
// forced a copy on every call.)
void cache(const std::string& name, ggml_tensor* tensor) {
    if (tensor != nullptr && tensor->view_src != nullptr) {
        tensor = ggml_cont(compute_ctx, tensor);
    }
    cache_tensor_map[name] = tensor;
}

Expand Down
Loading
Loading