Wan 2.2 support. (#9080)

2025-08-02 23:14:49 +08:00 · 2025-07-28 05:00:23 -07:00
parent d0210fe2e5
commit a88788dce6
8 changed files with 926 additions and 19 deletions
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -685,6 +685,49 @@ class WanTrackToVideo:
        out_latent["samples"] = latent
        return (positive, negative, out_latent)

+
+class Wan22ImageToVideoLatent:
+    """Build the starting video latent for Wan 2.2 image-to-video sampling.
+
+    Without ``start_image`` the node outputs an all-zero latent. With one, the
+    image frames are VAE-encoded into the leading latent frames and a
+    ``noise_mask`` is emitted alongside the samples: 0.0 over the encoded
+    frames, 1.0 everywhere else.
+    """
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"vae": ("VAE", ),
+                             "width": ("INT", {"default": 1280, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}),
+                             "height": ("INT", {"default": 704, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}),
+                             "length": ("INT", {"default": 49, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                },
+                "optional": {"start_image": ("IMAGE", ),
+                }}
+
+    RETURN_TYPES = ("LATENT",)
+    FUNCTION = "encode"
+
+    CATEGORY = "conditioning/inpaint"
+
+    def encode(self, vae, width, height, length, batch_size, start_image=None):
+        """Create the latent dict, optionally seeded with encoded start frames.
+
+        Args:
+            vae: Object whose ``encode`` turns the resized image frames into a latent.
+            width: Output width in pixels; the latent width is ``width // 16``.
+            height: Output height in pixels; the latent height is ``height // 16``.
+            length: Number of video frames; the latent holds
+                ``((length - 1) // 4) + 1`` frames.
+            batch_size: Number of times the result is repeated along dim 0.
+            start_image: Optional image frames; only the first ``length`` are used.
+
+        Returns:
+            A one-element tuple holding a dict with ``"samples"`` and, when
+            ``start_image`` is given, ``"noise_mask"``.
+        """
+        device = comfy.model_management.intermediate_device()
+        latent_frames = ((length - 1) // 4) + 1
+        latent = torch.zeros([1, 48, latent_frames, height // 16, width // 16], device=device)
+        # latent and mask are both 5-D, so a single repeat spec serves both.
+        batch_repeat = (batch_size, ) + (1,) * (latent.ndim - 1)
+
+        if start_image is None:
+            # Honour batch_size here too; this path used to always return a
+            # batch of one regardless of the requested batch size.
+            return ({"samples": latent.repeat(batch_repeat)},)
+
+        mask = torch.ones([latent.shape[0], 1, latent_frames, latent.shape[-2], latent.shape[-1]], device=device)
+
+        # NOTE(review): the movedim pair implies start_image is channels-last
+        # and common_upscale wants channels-first -- confirm against callers.
+        start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+        latent_temp = vae.encode(start_image)
+        encoded_frames = latent_temp.shape[-3]
+        latent[:, :, :encoded_frames] = latent_temp
+        mask[:, :, :encoded_frames] *= 0.0
+
+        # Where mask == 1 (nothing encoded) the zero latent is passed through the
+        # Wan22 latent format's process_out; encoded frames are kept unchanged.
+        latent_format = comfy.latent_formats.Wan22()
+        latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
+
+        out_latent = {
+            "samples": latent.repeat(batch_repeat),
+            "noise_mask": mask.repeat(batch_repeat),
+        }
+        return (out_latent,)
+
+
 NODE_CLASS_MAPPINGS = {
    "WanTrackToVideo": WanTrackToVideo,
    "WanImageToVideo": WanImageToVideo,
@@ -695,4 +738,5 @@ NODE_CLASS_MAPPINGS = {
    "TrimVideoLatent": TrimVideoLatent,
    "WanCameraImageToVideo": WanCameraImageToVideo,
    "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo,
+    "Wan22ImageToVideoLatent": Wan22ImageToVideoLatent,
 }