leejet · leejet · Apr 12, 2026 · Apr 23, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "ggml"]
     path = ggml
-	url = https://github.com/ggml-org/ggml.git
+	url = https://github.com/leejet/ggml.git
 [submodule "examples/server/frontend"]
 	path = examples/server/frontend
 	url = https://github.com/leejet/sdcpp-webui.git

diff --git a/examples/cli/CMakeLists.txt b/examples/cli/CMakeLists.txt
@@ -7,6 +7,10 @@ add_executable(${TARGET}
     image_metadata.cpp
     main.cpp
 )
+target_include_directories(${TARGET} PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    "${PROJECT_SOURCE_DIR}/src"
+)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT})
 if(SD_WEBP)

diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -158,6 +158,7 @@ Generation Options:
   --disable-auto-resize-ref-image          disable auto resize of ref images
   --disable-image-metadata                 do not embed generation metadata on image files
   --vae-tiling                             process vae in tiles to reduce memory usage
+  --temporal-tiling                        enable temporal tiling for LTX video VAE decode
   --hires                                  enable highres fix
   -s, --seed                               RNG seed (default: 42, use random seed for < 0)
   --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -385,11 +385,32 @@ std::string format_frame_idx(std::string pattern, int frame_idx) {
     return result;
 }
 
+// Sidecar WAV path for cli_params.output_path: a JPEG/PNG/WEBP/.avi/.webm extension is dropped, then ".wav" is appended.
+static fs::path get_video_audio_sidecar_path(const SDCliParams& cli_params) {
+    fs::path base_path    = cli_params.output_path;
+    std::string ext_lower = base_path.extension().string();  // empty when the path has no extension
+    // Route each byte through unsigned char: handing a negative char to tolower is undefined behavior.
+    std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(),
+                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+    const EncodedImageFormat output_format = encoded_image_format_from_path(base_path.string());
+    const bool known_media_ext             = output_format == EncodedImageFormat::JPEG ||
+                                 output_format == EncodedImageFormat::PNG ||
+                                 output_format == EncodedImageFormat::WEBP ||
+                                 ext_lower == ".avi" ||
+                                 ext_lower == ".webm";
+    if (!ext_lower.empty() && known_media_ext) {
+        base_path.replace_extension();
+    }
+    base_path += ".wav";
+    return base_path;
+}
+
 bool save_results(const SDCliParams& cli_params,
                   const SDContextParams& ctx_params,
                   const SDGenerationParams& gen_params,
                   sd_image_t* results,
-                  int num_results) {
+                  int num_results,
+                  const sd_audio_t* generated_audio = nullptr) {
     if (results == nullptr || num_results <= 0) {
         return false;
     }
@@ -442,6 +463,21 @@ bool save_results(const SDCliParams& cli_params,
         return ok;
     };
 
+    // Writes generated_audio (when present) to wav_path; a failed write is logged as a warning, not propagated.
+    auto write_audio_sidecar = [&](const fs::path& wav_path) {
+        if (generated_audio == nullptr) {
+            return;
+        }
+        const std::string wav_file = wav_path.string();
+        const bool written         = write_wav_to_file(wav_file, generated_audio->data, generated_audio->sample_count,
+                                                       generated_audio->channels, generated_audio->sample_rate);
+        if (!written) {
+            LOG_WARN("failed to save result audio to '%s'", wav_file.c_str());
+            return;
+        }
+        LOG_INFO("save result audio to '%s'", wav_file.c_str());
+    };
+
     int sucessful_reults = 0;
 
     if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
@@ -465,8 +501,16 @@ bool save_results(const SDCliParams& cli_params,
             ext = ".avi";
         fs::path video_path = base_path;
         video_path += ext;
-        if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
+        std::string final_ext_lower = ext.string();
+        std::transform(final_ext_lower.begin(), final_ext_lower.end(), final_ext_lower.begin(), ::tolower);
+        const bool mux_audio = generated_audio != nullptr && (final_ext_lower == ".avi" || final_ext_lower == ".webm");
+        if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps, 90, mux_audio ? generated_audio : nullptr) == 0) {
             LOG_INFO("save result video to '%s'", video_path.string().c_str());
+            if (generated_audio != nullptr && !mux_audio) {
+                fs::path wav_path = video_path;
+                wav_path.replace_extension(".wav");
+                write_audio_sidecar(wav_path);
+            }
             return true;
         } else {
             LOG_ERROR("Failed to save result video to '%s'", video_path.string().c_str());
@@ -488,6 +532,9 @@ bool save_results(const SDCliParams& cli_params,
         }
     }
     LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+    if (generated_audio != nullptr) {
+        write_audio_sidecar(get_video_audio_sidecar_path(cli_params));
+    }
     return sucessful_reults != 0;
 }
 
@@ -701,7 +748,8 @@ int main(int argc, const char* argv[]) {
     sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
 
     SDImageVec results;
-    int num_results = 0;
+    int num_results             = 0;
+    sd_audio_t* generated_audio = nullptr;
 
     if (cli_params.mode == UPSCALE) {
         num_results = 1;
@@ -733,7 +781,10 @@ int main(int argc, const char* argv[]) {
             results.adopt(generate_image(sd_ctx.get(), &img_gen_params), num_results);
         } else if (cli_params.mode == VID_GEN) {
             sd_vid_gen_params_t vid_gen_params = gen_params.to_sd_vid_gen_params_t();
-            sd_image_t* generated_video        = generate_video(sd_ctx.get(), &vid_gen_params, &num_results);
+            sd_image_t* generated_video        = nullptr;
+            if (!generate_video(sd_ctx.get(), &vid_gen_params, &generated_video, &num_results, &generated_audio)) {
+                generated_video = nullptr;
+            }
             results.adopt(generated_video, num_results);
         }
 
@@ -773,9 +824,12 @@ int main(int argc, const char* argv[]) {
         }
     }
 
-    if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results)) {
+    if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results, generated_audio)) {
+        free_sd_audio(generated_audio);
         return 1;
     }
 
+    free_sd_audio(generated_audio);
+
     return 0;
 }
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -340,10 +340,18 @@ ArgOptions SDContextParams::get_options() {
          "--high-noise-diffusion-model",
          "path to the standalone high noise diffusion model",
          &high_noise_diffusion_model_path},
+        {"",
+         "--embeddings-connectors",
+         "path to LTXAV embeddings connectors",
+         &embeddings_connectors_path},
         {"",
          "--vae",
          "path to standalone vae model",
          &vae_path},
+        {"",
+         "--audio-vae",
+         "path to standalone LTX audio vae model",
+         &audio_vae_path},
         {"",
          "--taesd",
          "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
@@ -661,7 +669,9 @@ std::string SDContextParams::to_string() const {
         << "  llm_vision_path: \"" << llm_vision_path << "\",\n"
         << "  diffusion_model_path: \"" << diffusion_model_path << "\",\n"
         << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
+        << "  embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
         << "  vae_path: \"" << vae_path << "\",\n"
+        << "  audio_vae_path: \"" << audio_vae_path << "\",\n"
         << "  taesd_path: \"" << taesd_path << "\",\n"
         << "  esrgan_path: \"" << esrgan_path << "\",\n"
         << "  control_net_path: \"" << control_net_path << "\",\n"
@@ -718,7 +728,9 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         llm_vision_path.c_str(),
         diffusion_model_path.c_str(),
         high_noise_diffusion_model_path.c_str(),
+        embeddings_connectors_path.c_str(),
         vae_path.c_str(),
+        audio_vae_path.c_str(),
         taesd_path.c_str(),
         control_net_path.c_str(),
         embedding_vec.data(),
@@ -990,6 +1002,11 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--temporal-tiling",
+         "enable temporal tiling for LTX video VAE decode",
+         true,
+         &vae_tiling_params.temporal_tiling},
         {"",
          "--hires",
          "enable highres fix",
@@ -1682,6 +1699,9 @@ bool SDGenerationParams::from_json_str(
         if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
             vae_tiling_params.enabled = tiling_json["enabled"];
         }
+        if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
+            vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
+        }
         if (tiling_json.contains("tile_size_x") && tiling_json["tile_size_x"].is_number_integer()) {
             vae_tiling_params.tile_size_x = tiling_json["tile_size_x"];
         }
@@ -2187,6 +2207,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
     params.strength                 = strength;
     params.seed                     = seed;
     params.video_frames             = video_frames;
+    params.fps                      = fps;
     params.vace_strength            = vace_strength;
     params.vae_tiling_params        = vae_tiling_params;
     params.cache                    = cache_params;
@@ -2275,6 +2296,7 @@ std::string SDGenerationParams::to_string() const {
         << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
         << "  vae_tiling_params: { "
         << vae_tiling_params.enabled << ", "
+        << vae_tiling_params.temporal_tiling << ", "
         << vae_tiling_params.tile_size_x << ", "
         << vae_tiling_params.tile_size_y << ", "
         << vae_tiling_params.target_overlap << ", "

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -92,7 +92,9 @@ struct SDContextParams {
     std::string llm_vision_path;
     std::string diffusion_model_path;
     std::string high_noise_diffusion_model_path;
+    std::string embeddings_connectors_path;
     std::string vae_path;
+    std::string audio_vae_path;
     std::string taesd_path;
     std::string esrgan_path;
     std::string control_net_path;
@@ -183,7 +185,7 @@ struct SDGenerationParams {
     int video_frames                     = 1;
     int fps                              = 16;
     float vace_strength                  = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
 
     std::string pm_id_images_dir;
     std::string pm_id_embed_path;