Implement the Cosmos Image/Video-to-World (video) diffusion models.

Use CosmosImageToVideoLatent to set the input image or video.
2025-08-02 23:14:49 +08:00 · 2025-01-14 05:14:10 -05:00
parent 1f1c7b7b56
commit 3aaabb12d4
6 changed files with 84 additions and 12 deletions
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -534,7 +534,7 @@ class VAE:
    def encode(self, pixel_samples):
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
        pixel_samples = pixel_samples.movedim(-1, 1)
-        if self.latent_dim == 3:
+        if self.latent_dim == 3 and pixel_samples.ndim < 5:
            pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
        try:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)