Mistoline flux controlnet support.

2025-08-02 15:04:50 +08:00 · 2024-09-05 00:04:52 -04:00
parent c7427375ee
commit 5cbaa9e07c
2 changed files with 85 additions and 30 deletions
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@@ -1,4 +1,5 @@
 #Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
+#modified to support different types of flux controlnets

 import torch
 import math
@@ -12,22 +13,65 @@ from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
 from .model import Flux
 import comfy.ldm.common_dit

+class MistolineCondDownsamplBlock(nn.Module):
+    def __init__(self, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
+        )
+
+    def forward(self, x):
+        return self.encoder(x)
+
+class MistolineControlnetBlock(nn.Module):
+    def __init__(self, hidden_size, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.linear = operations.Linear(hidden_size, hidden_size, dtype=dtype, device=device)
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        return self.act(self.linear(x))
+

 class ControlNetFlux(Flux):
-    def __init__(self, latent_input=False, num_union_modes=0, image_model=None, dtype=None, device=None, operations=None, **kwargs):
+    def __init__(self, latent_input=False, num_union_modes=0, mistoline=False, image_model=None, dtype=None, device=None, operations=None, **kwargs):
        super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)

        self.main_model_double = 19
        self.main_model_single = 38
+
+        self.mistoline = mistoline
        # add ControlNet blocks
+        if self.mistoline:
+            control_block = lambda : MistolineControlnetBlock(self.hidden_size, dtype=dtype, device=device, operations=operations)
+        else:
+            control_block = lambda : operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
+
        self.controlnet_blocks = nn.ModuleList([])
        for _ in range(self.params.depth):
-            controlnet_block = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
-            self.controlnet_blocks.append(controlnet_block)
+            self.controlnet_blocks.append(control_block())

        self.controlnet_single_blocks = nn.ModuleList([])
        for _ in range(self.params.depth_single_blocks):
-            self.controlnet_single_blocks.append(operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device))
+            self.controlnet_single_blocks.append(control_block())

        self.num_union_modes = num_union_modes
        self.controlnet_mode_embedder = None
@@ -38,23 +82,26 @@ class ControlNetFlux(Flux):
        self.latent_input = latent_input
        self.pos_embed_input = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
        if not self.latent_input:
-            self.input_hint_block = nn.Sequential(
-                operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
-            )
+            if self.mistoline:
+                self.input_cond_block = MistolineCondDownsamplBlock(dtype=dtype, device=device, operations=operations)
+            else:
+                self.input_hint_block = nn.Sequential(
+                    operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
+                )

    def forward_orig(
        self,
@@ -73,9 +120,6 @@ class ControlNetFlux(Flux):

        # running on sequences img
        img = self.img_in(img)
-        if not self.latent_input:
-            controlnet_cond = self.input_hint_block(controlnet_cond)
-            controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)

        controlnet_cond = self.pos_embed_input(controlnet_cond)
        img = img + controlnet_cond
@@ -131,9 +175,14 @@ class ControlNetFlux(Flux):
        patch_size = 2
        if self.latent_input:
            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
-            hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+        elif self.mistoline:
+            hint = hint * 2.0 - 1.0
+            hint = self.input_cond_block(hint)
        else:
            hint = hint * 2.0 - 1.0
+            hint = self.input_hint_block(hint)
+
+        hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)

        bs, c, h, w = x.shape
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))